#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch   armv8-a+crypto
.text
.globl  aes_gcm_enc_128_kernel
.type   aes_gcm_enc_128_kernel,%function
.align  4
aes_gcm_enc_128_kernel:
        cbz     x1, .L128_enc_ret
        stp     x19, x20, [sp, #-112]!
        mov     x16, x4
        mov     x8, x5
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        ldp     x13, x14, [x8, #160]                     //load rk10
#ifdef __AARCH64EB__
        ror     x13, x13, #32
        ror     x14, x14, #32
#endif
        ld1     {v11.16b}, [x3]
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b
        lsr     x5, x1, #3              //byte_len
        mov     x15, x5

        ld1     {v18.4s}, [x8], #16                                                               //load rk0
        add     x4, x0, x1, lsr #3   //end_input_ptr
        sub     x5, x5, #1      //byte_len - 1

        lsr     x12, x11, #32
        ldr     q15, [x3, #112]                        //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif
        fmov    d1, x10                               //CTR block 1
        rev     w12, w12                                //rev_ctr32

        add     w12, w12, #1                            //increment rev_ctr32
        orr     w11, w11, w11
        ld1     {v19.4s}, [x8], #16                                                               //load rk1

        rev     w9, w12                                 //CTR block 1
        add     w12, w12, #1                            //CTR block 1
        fmov    d3, x10                               //CTR block 3

        orr     x9, x11, x9, lsl #32            //CTR block 1
        ld1     { v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible

        fmov    v1.d[1], x9                               //CTR block 1
        rev     w9, w12                                 //CTR block 2

        fmov    d2, x10                               //CTR block 2
        orr     x9, x11, x9, lsl #32            //CTR block 2
        add     w12, w12, #1                            //CTR block 2

        fmov    v2.d[1], x9                               //CTR block 2
        rev     w9, w12                                 //CTR block 3

        orr     x9, x11, x9, lsl #32            //CTR block 3
        ld1     {v20.4s}, [x8], #16                                                               //load rk2

        add     w12, w12, #1                            //CTR block 3
        fmov    v3.d[1], x9                               //CTR block 3

        ldr     q14, [x3, #80]                         //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif
        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 0
        ld1     {v21.4s}, [x8], #16                                                               //load rk3

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 0
        ldr     q12, [x3, #32]                         //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 0
        ld1     {v22.4s}, [x8], #16                                                               //load rk4

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 0
        ld1     {v23.4s}, [x8], #16                                                               //load rk5

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 1
        trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 1
        ld1     {v24.4s}, [x8], #16                                                               //load rk6

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 1
        ld1     {v25.4s}, [x8], #16                                                               //load rk7

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 1
        trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 2
        ld1     {v26.4s}, [x8], #16                                                               //load rk8

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 2
        ldr     q13, [x3, #64]                         //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 2
        eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 3

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 3

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 3
        ld1     {v27.4s}, [x8], #16                                                               //load rk9

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 3

        and     x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
        trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 4
        add     x5, x5, x0

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 4
        cmp     x0, x5                   //check if we have <= 4 blocks

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 4

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 5

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 5

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 6

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 4

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 6
        trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 6

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 5

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 7

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 7

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 6

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 7

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 8

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 7

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 8

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 8

        aese    v2.16b, v27.16b                                      //AES block 2 - round 9

        aese    v0.16b, v27.16b                                      //AES block 0 - round 9

        eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k

        aese    v1.16b, v27.16b                                      //AES block 1 - round 9

        aese    v3.16b, v27.16b                                      //AES block 3 - round 9
        b.ge    .L128_enc_tail                                    //handle tail

        ldp     x6, x7, [x0, #0]            //AES block 0 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        ldp     x21, x22, [x0, #32]           //AES block 2 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        ldp     x19, x20, [x0, #16]           //AES block 1 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        ldp     x23, x24, [x0, #48]           //AES block 3 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        eor     x6, x6, x13                     //AES block 0 - round 10 low
        eor     x7, x7, x14                     //AES block 0 - round 10 high

        eor     x21, x21, x13                     //AES block 2 - round 10 low
        fmov    d4, x6                               //AES block 0 - mov low

        eor     x19, x19, x13                     //AES block 1 - round 10 low
        eor     x22, x22, x14                     //AES block 2 - round 10 high
        fmov    v4.d[1], x7                           //AES block 0 - mov high

        fmov    d5, x19                               //AES block 1 - mov low
        eor     x20, x20, x14                     //AES block 1 - round 10 high

        eor     x23, x23, x13                     //AES block 3 - round 10 low
        fmov    v5.d[1], x20                           //AES block 1 - mov high

        fmov    d6, x21                               //AES block 2 - mov low
        eor     x24, x24, x14                     //AES block 3 - round 10 high
        rev     w9, w12                                 //CTR block 4

        fmov    v6.d[1], x22                           //AES block 2 - mov high
        orr     x9, x11, x9, lsl #32            //CTR block 4

        eor     v4.16b, v4.16b, v0.16b                          //AES block 0 - result
        fmov    d0, x10                               //CTR block 4
        add     w12, w12, #1                            //CTR block 4

        fmov    v0.d[1], x9                               //CTR block 4
        rev     w9, w12                                 //CTR block 5

        eor     v5.16b, v5.16b, v1.16b                          //AES block 1 - result
        fmov    d1, x10                               //CTR block 5
        orr     x9, x11, x9, lsl #32            //CTR block 5

        add     w12, w12, #1                            //CTR block 5
        add     x0, x0, #64                       //AES input_ptr update
        fmov    v1.d[1], x9                               //CTR block 5

        fmov    d7, x23                               //AES block 3 - mov low
        rev     w9, w12                                 //CTR block 6
        st1     { v4.16b}, [x2], #16                     //AES block 0 - store result

        fmov    v7.d[1], x24                           //AES block 3 - mov high
        orr     x9, x11, x9, lsl #32            //CTR block 6

        add     w12, w12, #1                            //CTR block 6
        eor     v6.16b, v6.16b, v2.16b                          //AES block 2 - result
        st1     { v5.16b}, [x2], #16                     //AES block 1 - store result

        fmov    d2, x10                               //CTR block 6
        cmp     x0, x5                   //check if we have <= 8 blocks

        fmov    v2.d[1], x9                               //CTR block 6
        rev     w9, w12                                 //CTR block 7
        st1     { v6.16b}, [x2], #16                     //AES block 2 - store result

        orr     x9, x11, x9, lsl #32            //CTR block 7

        eor     v7.16b, v7.16b, v3.16b                          //AES block 3 - result
        st1     { v7.16b}, [x2], #16                     //AES block 3 - store result
        b.ge    .L128_enc_prepretail                              //do prepretail

.L128_enc_main_loop:    //main  loop start
        ldp     x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        rev64   v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        fmov    d3, x10                               //CTR block 4k+3

        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        rev64   v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        add     w12, w12, #1                            //CTR block 4k+3
        fmov    v3.d[1], x9                               //CTR block 4k+3

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        mov     d30, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        eor     x24, x24, x14                     //AES block 4k+3 - round 10 high

        pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
        ldp     x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        rev     w9, w12                                 //CTR block 4k+8

        eor     v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid
        orr     x9, x11, x9, lsl #32            //CTR block 4k+8

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        add     w12, w12, #1                            //CTR block 4k+8
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        eor     v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high

        pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)

        pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid

        pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        eor     x7, x7, x14                     //AES block 4k+4 - round 10 high

        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        eor     x6, x6, x13                     //AES block 4k+4 - round 10 low

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
        eor     v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid

        pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
        movi    v8.8b, #0xc2

        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        shl     d8, d8, #56               //mod_constant

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
        ldp     x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        ldp     x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        eor     x19, x19, x13                     //AES block 4k+5 - round 10 low

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
        eor     x23, x23, x13                     //AES block 4k+3 - round 10 low

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        fmov    d4, x6                               //AES block 4k+4 - mov low
        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        fmov    v4.d[1], x7                           //AES block 4k+4 - mov high

        add     x0, x0, #64                       //AES input_ptr update
        fmov    d7, x23                               //AES block 4k+3 - mov low
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        fmov    d5, x19                               //AES block 4k+5 - mov low

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        eor     x20, x20, x14                     //AES block 4k+5 - round 10 high

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
        fmov    v5.d[1], x20                           //AES block 4k+5 - mov high

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        fmov    v7.d[1], x24                           //AES block 4k+3 - mov high

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        cmp     x0, x5                   //.LOOP CONTROL

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9
        eor     x21, x21, x13                     //AES block 4k+6 - round 10 low
        eor     x22, x22, x14                     //AES block 4k+6 - round 10 high

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        fmov    d6, x21                               //AES block 4k+6 - mov low

        aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9
        fmov    v6.d[1], x22                           //AES block 4k+6 - mov high

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        eor     v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result

        fmov    d0, x10                               //CTR block 4k+8
        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8

        fmov    v0.d[1], x9                               //CTR block 4k+8
        rev     w9, w12                                 //CTR block 4k+9
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        eor     v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result

        add     w12, w12, #1                            //CTR block 4k+9
        orr     x9, x11, x9, lsl #32            //CTR block 4k+9
        fmov    d1, x10                               //CTR block 4k+9

        pmull   v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
        fmov    v1.d[1], x9                               //CTR block 4k+9
        rev     w9, w12                                 //CTR block 4k+10

        aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
        st1     { v4.16b}, [x2], #16                     //AES block 4k+4 - store result
        eor     v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
        orr     x9, x11, x9, lsl #32            //CTR block 4k+10

        aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9
        add     w12, w12, #1                            //CTR block 4k+10
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
        fmov    d2, x10                               //CTR block 4k+10

        eor     v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
        st1     { v5.16b}, [x2], #16                     //AES block 4k+5 - store result

        fmov    v2.d[1], x9                               //CTR block 4k+10
        st1     { v6.16b}, [x2], #16                     //AES block 4k+6 - store result
        rev     w9, w12                                 //CTR block 4k+11

        orr     x9, x11, x9, lsl #32            //CTR block 4k+11
        eor     v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        st1     { v7.16b}, [x2], #16                     //AES block 4k+3 - store result
        b.lt    .L128_enc_main_loop

.L128_enc_prepretail:   //PREPRETAIL
        rev64   v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
        fmov    d3, x10                               //CTR block 4k+3
        rev64   v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)

        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        add     w12, w12, #1                            //CTR block 4k+3
        fmov    v3.d[1], x9                               //CTR block 4k+3

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)

        pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low

        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1

        pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        mov     d30, v5.d[1]                                  //GHASH block 4k+1 - mid

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid

        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        eor     v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid

        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1

        pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0

        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        eor     v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid

        pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        eor     v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2

        pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
        movi    v8.8b, #0xc2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        eor     v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2

        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3

        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
        shl     d8, d8, #56               //mod_constant

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4

        pmull   v28.1q, v9.1d, v8.1d
        eor     v10.16b, v10.16b, v9.16b                         //karatsuba tidy up

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        ext     v9.16b, v9.16b, v9.16b, #8

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        eor     v10.16b, v10.16b, v11.16b

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
        eor     v10.16b, v10.16b, v28.16b

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        eor     v10.16b, v10.16b, v9.16b

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7

        pmull   v28.1q, v10.1d, v8.1d

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
        ext     v10.16b, v10.16b, v10.16b, #8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        eor     v11.16b, v11.16b, v28.16b

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8

        aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8

        aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9

        aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9
        eor     v11.16b, v11.16b, v10.16b

        aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
//------------------------------------------------------------------------
// Encrypt tail: 1..4 (possibly partial) blocks remain.  The first tail
// block of plaintext is loaded, pre-whitened with the final round key
// (x13/x14 = rk10) and XORed with keystream v0.  The branches below
// dispatch on bytes left (>48 / >32 / >16 / <=16) and shuffle the unused
// keystreams v1-v3 so that v3 always ends up holding the keystream for
// the final (possibly partial) block; w12 is decremented once for each
// pre-incremented CTR block that goes unused, so the counter stored at
// the end stays correct.
//------------------------------------------------------------------------
.L128_enc_tail: //TAIL

        sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
        ldp     x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        cmp     x5, #48

        ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
        eor     x6, x6, x13                     //AES block 4k+4 - round 10 low
        eor     x7, x7, x14                     //AES block 4k+4 - round 10 high

        fmov    d4, x6                               //AES block 4k+4 - mov low

        fmov    v4.d[1], x7                           //AES block 4k+4 - mov high

        eor     v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result

        b.gt    .L128_enc_blocks_more_than_3

        sub     w12, w12, #1                    //unwind one unused CTR pre-increment
        movi    v11.8b, #0
        mov     v3.16b, v2.16b

        cmp     x5, #32
        mov     v2.16b, v1.16b
        movi    v9.8b, #0

        movi    v10.8b, #0
        b.gt    .L128_enc_blocks_more_than_2

        mov     v3.16b, v1.16b
        cmp     x5, #16

        sub     w12, w12, #1                    //unwind another unused CTR pre-increment
        b.gt    .L128_enc_blocks_more_than_1

        sub     w12, w12, #1                    //unwind another unused CTR pre-increment
        b       .L128_enc_blocks_less_than_1
//------------------------------------------------------------------------
// More than 3 blocks left: store the final-3 result, begin its GHASH
// using h4 (v15; the karatsuba mid key h4k is taken from v17.d[1] -
// v15/v17 loaded above this view, per this file's conventions), and
// whiten the final-2 input block.  The partial-tag feed-in register v8
// is zeroed after its single use so it is not folded in again.
//------------------------------------------------------------------------
.L128_enc_blocks_more_than_3:   //blocks        left >  3
        st1     { v5.16b}, [x2], #16                     //AES final-3 block  - store result

        ldp     x6, x7, [x0], #16           //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        rev64   v4.16b, v5.16b                                    //GHASH final-3 block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag
        eor     x7, x7, x14                     //AES final-2 block - round 10 high
        eor     x6, x6, x13                     //AES final-2 block - round 10 low

        fmov    d5, x6                                 //AES final-2 block - mov low

        movi    v8.8b, #0                                        //suppress further partial tag feed in
        fmov    v5.d[1], x7                             //AES final-2 block - mov high

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
        mov     d22, v4.d[1]                                 //GHASH final-3 block - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high

        mov     d10, v17.d[1]                               //GHASH final-3 block - mid

        eor     v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid

        pmull   v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
//------------------------------------------------------------------------
// More than 2 blocks left: store the final-2 result, GHASH it with h3
// (v14, by this file's load conventions; karatsuba mid via v17 low half)
// accumulating into v9/v10/v11, and whiten the final-1 input block.
//------------------------------------------------------------------------
.L128_enc_blocks_more_than_2:   //blocks        left >  2

        st1     { v5.16b}, [x2], #16                     //AES final-2 block - store result

        rev64   v4.16b, v5.16b                                    //GHASH final-2 block
        ldp     x6, x7, [x0], #16           //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        eor     x6, x6, x13                     //AES final-1 block - round 10 low

        fmov    d5, x6                                 //AES final-1 block - mov low
        eor     x7, x7, x14                     //AES final-1 block - round 10 high

        pmull2  v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
        fmov    v5.d[1], x7                             //AES final-1 block - mov high

        mov     d22, v4.d[1]                                 //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid

        eor     v5.16b, v5.16b, v2.16b                            //AES final-1 block - result

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low

        pmull   v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid

        movi    v8.8b, #0                                        //suppress further partial tag feed in

        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
//------------------------------------------------------------------------
// More than 1 block left: store the final-1 result, GHASH it with h2
// (v13, by this file's load conventions; karatsuba mid via the high half
// of v16 = h2k), and whiten the final input block with keystream v3.
//------------------------------------------------------------------------
.L128_enc_blocks_more_than_1:   //blocks        left >  1

        st1     { v5.16b}, [x2], #16                     //AES final-1 block - store result

        rev64   v4.16b, v5.16b                                    //GHASH final-1 block
        ldp     x6, x7, [x0], #16           //AES final block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        eor     x7, x7, x14                     //AES final block - round 10 high
        eor     x6, x6, x13                     //AES final block - round 10 low

        fmov    d5, x6                                 //AES final block - mov low

        pmull2  v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
        fmov    v5.d[1], x7                             //AES final block - mov high

        mov     d22, v4.d[1]                                 //GHASH final-1 block - mid

        pmull   v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid

        eor     v5.16b, v5.16b, v3.16b                            //AES final block - result

        ins     v22.d[1], v22.d[0]                            //GHASH final-1 block - mid

        pmull2  v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high

        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
        movi    v8.8b, #0                                        //suppress further partial tag feed in
//------------------------------------------------------------------------
// Final (possibly partial) block.  A byte mask for the valid portion of
// the last block is built from bit_length % 128 and applied to the
// result, the masked block is GHASHed with h1 (v12), the accumulated tag
// undergoes the final modular reduction (0xc2 constant), the masked
// result is merged with the pre-existing bytes at the output (bif) so a
// partial store does not clobber trailing bytes, then the updated 32-bit
// counter and the new tag are written back, callee-saved registers are
// restored and the processed byte count (x15) is returned in x0.
//------------------------------------------------------------------------
.L128_enc_blocks_less_than_1:   //blocks        left <= 1

        and     x1, x1, #127                    //bit_length %= 128
        mvn     x13, xzr                                      //rk10_l = 0xffffffffffffffff

        mvn     x14, xzr                                      //rk10_h = 0xffffffffffffffff
        sub     x1, x1, #128                    //bit_length -= 128

        neg     x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])

        and     x1, x1, #127                    //bit_length %= 128

        lsr     x14, x14, x1                     //rk10_h is mask for top 64b of last block
        cmp     x1, #64

        csel    x6, x13, x14, lt                //lo mask: all-ones when >64 data bits, else shifted mask
        csel    x7, x14, xzr, lt                //hi mask: shifted mask when >64 data bits, else zero

        fmov    d0, x6                                 //ctr0b is mask for last block

        fmov    v0.d[1], x7

        and     v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits

        rev64   v4.16b, v5.16b                                    //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        mov     d8, v4.d[1]                                  //GHASH final block - mid

        pmull   v21.1q, v4.1d, v12.1d                          //GHASH final block - low
        ld1     { v18.16b}, [x2]                            //load existing bytes where the possibly partial last block is to be stored

        eor     v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
#ifndef __AARCH64EB__
        rev     w9, w12
#else
        mov     w9, w12
#endif
        pmull2  v20.1q, v4.2d, v12.2d                          //GHASH final block - high

        pmull   v8.1q, v8.1d, v16.1d                          //GHASH final block - mid

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final block - low

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final block - high

        eor     v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
        movi    v8.8b, #0xc2

        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        shl     d8, d8, #56               //mod_constant

        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid

        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        pmull   v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low

        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        bif     v5.16b, v18.16b, v0.16b                              //insert existing bytes in top end of result before storing

        eor     v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
        st1     { v5.16b}, [x2]                          //store all 16B

        str     w9, [x16, #12]                          //store the updated counter

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b
        mov     x0, x15                                 //return processed byte count
        st1     { v11.16b }, [x3]                       //store the updated tag (Xi)
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

.L128_enc_ret:  //zero-length input: nothing processed, return 0
        mov     w0, #0x0
        ret
.size   aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
//------------------------------------------------------------------------
// AES-128-GCM decrypt kernel.
// NOTE(review): argument roles inferred from the register use below -
// confirm against the C prototype:
//   x0: ciphertext input         x1: length in bits
//   x2: plaintext output         x3: current GHASH tag Xi (hash key
//                                    powers h1/h2/h3/h4 are loaded from
//                                    fixed offsets #32/#64/#80/#112
//                                    relative to x3)
//   x4: counter block (ivec)     x5: AES-128 round keys (rk0..rk10)
// Entry: saves callee-saved x19-x24 and d8-d15, loads round keys, the
// counter and the hash key powers (karatsuba key values are combined
// into v16/v17), and starts AES on the first four counter blocks while
// the loads are still in flight.  Falls through to the preamble below
// unless <= 4 blocks remain, in which case it branches to the tail.
//------------------------------------------------------------------------
.globl  aes_gcm_dec_128_kernel
.type   aes_gcm_dec_128_kernel,%function
.align  4
aes_gcm_dec_128_kernel:
        cbz     x1, .L128_dec_ret
        stp     x19, x20, [sp, #-112]!
        mov     x16, x4
        mov     x8, x5
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        lsr     x5, x1, #3              //byte_len
        mov     x15, x5
        ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        ldp     x13, x14, [x8, #160]                     //load rk10
#ifdef __AARCH64EB__
        ror     x14, x14, 32            //NOTE(review): '#' omitted on the immediate, unlike the enc path; equivalent under GAS
        ror     x13, x13, 32
#endif
        sub     x5, x5, #1      //byte_len - 1
        ld1     {v18.4s}, [x8], #16                                //load rk0

        and     x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
        ld1     { v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible

        ldr     q13, [x3, #64]                         //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif
        lsr     x12, x11, #32
        fmov    d2, x10                               //CTR block 2

        ld1     {v19.4s}, [x8], #16                                //load rk1
        orr     w11, w11, w11           //NOTE(review): self-or is a no-op; presumably scheduling filler (mirrors the enc path)
        rev     w12, w12                                //rev_ctr32

        fmov    d1, x10                               //CTR block 1
        add     w12, w12, #1                            //increment rev_ctr32

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 0
        rev     w9, w12                                 //CTR block 1

        orr     x9, x11, x9, lsl #32            //CTR block 1
        ld1     {v20.4s}, [x8], #16                                //load rk2
        add     w12, w12, #1                            //CTR block 1

        fmov    v1.d[1], x9                               //CTR block 1
        rev     w9, w12                                 //CTR block 2
        add     w12, w12, #1                            //CTR block 2

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 1
        orr     x9, x11, x9, lsl #32            //CTR block 2

        fmov    v2.d[1], x9                               //CTR block 2
        rev     w9, w12                                 //CTR block 3

        fmov    d3, x10                               //CTR block 3
        orr     x9, x11, x9, lsl #32            //CTR block 3
        add     w12, w12, #1                            //CTR block 3

        fmov    v3.d[1], x9                               //CTR block 3
        add     x4, x0, x1, lsr #3   //end_input_ptr

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 0
        ld1     {v21.4s}, [x8], #16                                //load rk3

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 2
        ld1     {v22.4s}, [x8], #16                                //load rk4

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 0
        ld1     {v23.4s}, [x8], #16                                //load rk5

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 1
        ld1     {v24.4s}, [x8], #16                                //load rk6

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 0

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 1

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 2

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 1
        ld1     { v11.16b}, [x3]                        //load current tag (Xi)
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 3
        ld1     {v25.4s}, [x8], #16                                //load rk7

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 3

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 2
        ld1     {v26.4s}, [x8], #16                                //load rk8

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 4

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 3

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 3
        ldr     q14, [x3, #80]                         //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif
        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 4
        ld1     {v27.4s}, [x8], #16                                //load rk9

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 5

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 4

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 4

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 5
        ldr     q12, [x3, #32]                         //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif
        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 5

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 6

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 6

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 6

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 6
        trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h

        ldr     q15, [x3, #112]                        //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif
        trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
        add     x5, x5, x0              //main loop end pointer

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 7

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 7

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 7
        eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 7

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 8
        trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 8

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 8
        trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h

        aese    v2.16b, v27.16b                                      //AES block 2 - round 9

        aese    v3.16b, v27.16b                                      //AES block 3 - round 9

        aese    v0.16b, v27.16b                                      //AES block 0 - round 9
        cmp     x0, x5                   //check if we have <= 4 blocks

        aese    v1.16b, v27.16b                                      //AES block 1 - round 9
        eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k
        b.ge    .L128_dec_tail                                    //handle tail

//------------------------------------------------------------------------
// At least one full 4-block group remains: load the first four ciphertext
// blocks, decrypt them with keystreams v0-v3, store blocks 0 and 1
// immediately (after final-round whitening with rk10 in x13/x14), and set
// up CTR blocks 4 and 5 for the next group.  Falls into the main loop, or
// branches to prepretail if this is the last full group.
//------------------------------------------------------------------------
        ld1     {v4.16b, v5.16b}, [x0], #32               //AES block 0 - load ciphertext; AES block 1 - load ciphertext

        eor     v1.16b, v5.16b, v1.16b                            //AES block 1 - result
        ld1     {v6.16b}, [x0], #16                       //AES block 2 - load ciphertext

        eor     v0.16b, v4.16b, v0.16b                            //AES block 0 - result
        rev64   v4.16b, v4.16b                                    //GHASH block 0
        rev     w9, w12                                 //CTR block 4

        orr     x9, x11, x9, lsl #32            //CTR block 4
        add     w12, w12, #1                            //CTR block 4
        ld1     {v7.16b}, [x0], #16                       //AES block 3 - load ciphertext

        rev64   v5.16b, v5.16b                                    //GHASH block 1
        mov     x19, v1.d[0]                            //AES block 1 - mov low

        mov     x20, v1.d[1]                            //AES block 1 - mov high

        mov     x6, v0.d[0]                            //AES block 0 - mov low
        cmp     x0, x5                   //check if we have <= 8 blocks

        mov     x7, v0.d[1]                            //AES block 0 - mov high

        fmov    d0, x10                               //CTR block 4

        fmov    v0.d[1], x9                               //CTR block 4
        rev     w9, w12                                 //CTR block 5
        eor     x19, x19, x13                   //AES block 1 - round 10 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        fmov    d1, x10                               //CTR block 5
        add     w12, w12, #1                            //CTR block 5
        orr     x9, x11, x9, lsl #32            //CTR block 5

        fmov    v1.d[1], x9                               //CTR block 5
        rev     w9, w12                                 //CTR block 6
        add     w12, w12, #1                            //CTR block 6

        orr     x9, x11, x9, lsl #32            //CTR block 6

        eor     x20, x20, x14                   //AES block 1 - round 10 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        eor     x6, x6, x13                   //AES block 0 - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v2.16b, v6.16b, v2.16b                            //AES block 2 - result

        eor     x7, x7, x14                   //AES block 0 - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        stp     x6, x7, [x2], #16        //AES block 0 - store result

        stp     x19, x20, [x2], #16        //AES block 1 - store result
        b.ge    .L128_dec_prepretail                              //do prepretail

//------------------------------------------------------------------------
// Main decrypt loop: four blocks per iteration.  AES-CTR rounds for the
// next four counter blocks are interleaved with the GHASH update over the
// four previously loaded ciphertext blocks (v4-v7, using h4..h1 with the
// karatsuba mid keys in v16/v17) and the modular reduction of the
// accumulated tag, keeping both crypto pipes busy.  Because this is the
// decrypt direction, GHASH operates on the ciphertext as loaded, so each
// plaintext result can be stored as soon as it is whitened with rk10.
//------------------------------------------------------------------------
.L128_dec_main_loop:    //main  loop start
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low

        pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        fmov    d2, x10                               //CTR block 4k+6

        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2
        fmov    v2.d[1], x9                               //CTR block 4k+6
        rev     w9, w12                                 //CTR block 4k+7

        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1
        mov     d30, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3

        pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high
        orr     x9, x11, x9, lsl #32            //CTR block 4k+7

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        fmov    d3, x10                               //CTR block 4k+7
        eor     v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        fmov    v3.d[1], x9                               //CTR block 4k+7

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low

        pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        eor     v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0

        pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        eor     x23, x23, x13                   //AES block 4k+3 - round 10 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
        eor     x22, x22, x14                   //AES block 4k+2 - round 10 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        eor     v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid

        pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
        eor     v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
        eor     x24, x24, x14                   //AES block 4k+3 - round 10 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif
        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
        eor     x21, x21, x13                   //AES block 4k+2 - round 10 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif
        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        movi    v8.8b, #0xc2

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
        ld1     {v4.16b}, [x0], #16                       //AES block 4k+3 - load ciphertext

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
        add     w12, w12, #1                            //CTR block 4k+7

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
        shl     d8, d8, #56               //mod_constant

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        rev     w9, w12                                 //CTR block 4k+8

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        ld1     {v5.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9
        orr     x9, x11, x9, lsl #32            //CTR block 4k+8

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        eor     v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        ld1     {v6.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext

        add     w12, w12, #1                            //CTR block 4k+8
        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
        eor     v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        ld1     {v7.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6

        rev64   v5.16b, v5.16b                                    //GHASH block 4k+5
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        fmov    d0, x10                               //CTR block 4k+8

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        fmov    v0.d[1], x9                               //CTR block 4k+8
        rev     w9, w12                                 //CTR block 4k+9

        aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
        orr     x9, x11, x9, lsl #32            //CTR block 4k+9
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
        eor     x7, x7, x14                   //AES block 4k+4 - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low
        mov     x20, v1.d[1]                            //AES block 4k+5 - mov high
        eor     x6, x6, x13                   //AES block 4k+4 - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
        mov     x19, v1.d[0]                            //AES block 4k+5 - mov low
        add     w12, w12, #1                            //CTR block 4k+9

        aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9
        fmov    d1, x10                               //CTR block 4k+9
        cmp     x0, x5                   //.LOOP CONTROL

        rev64   v4.16b, v4.16b                                    //GHASH block 4k+4
        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        fmov    v1.d[1], x9                               //CTR block 4k+9

        rev     w9, w12                                 //CTR block 4k+10
        add     w12, w12, #1                            //CTR block 4k+10

        eor     x20, x20, x14                   //AES block 4k+5 - round 10 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        stp     x6, x7, [x2], #16        //AES block 4k+4 - store result

        eor     x19, x19, x13                   //AES block 4k+5 - round 10 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        stp     x19, x20, [x2], #16        //AES block 4k+5 - store result

        orr     x9, x11, x9, lsl #32            //CTR block 4k+10
        b.lt    .L128_dec_main_loop

.L128_dec_prepretail:   //PREPRETAIL
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low
        mov     d30, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high

        eor     v4.16b, v4.16b, v11.16b                           //PRE 1
        fmov    d2, x10                               //CTR block 4k+6
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        fmov    v2.d[1], x9                               //CTR block 4k+6

        rev     w9, w12                                 //CTR block 4k+7
        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low
        eor     v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d10, v17.d[1]                               //GHASH block 4k - mid
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
        orr     x9, x11, x9, lsl #32            //CTR block 4k+7

        pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid
        fmov    d3, x10                               //CTR block 4k+7

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        fmov    v3.d[1], x9                               //CTR block 4k+7

        pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid

        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low

        pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
        eor     v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high

        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid

        pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        eor     v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
        movi    v8.8b, #0xc2

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        eor     v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        eor     x23, x23, x13                   //AES block 4k+3 - round 10 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     x21, x21, x13                   //AES block 4k+2 - round 10 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif
        eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        shl     d8, d8, #56               //mod_constant

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        eor     x24, x24, x14                   //AES block 4k+3 - round 10 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif
        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
        eor     x22, x22, x14                   //AES block 4k+2 - round 10 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
        add     w12, w12, #1                            //CTR block 4k+7
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result

        aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9
        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
.L128_dec_tail: //TAIL

        sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
        ld1     { v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext

        eor     v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result

        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high

        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low

        cmp     x5, #48

        eor     x7, x7, x14                   //AES block 4k+4 - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
        eor     x6, x6, x13                   //AES block 4k+4 - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        b.gt    .L128_dec_blocks_more_than_3

        mov     v3.16b, v2.16b
        sub     w12, w12, #1
        movi    v11.8b, #0

        movi    v9.8b, #0
        mov     v2.16b, v1.16b

        movi    v10.8b, #0
        cmp     x5, #32
        b.gt    .L128_dec_blocks_more_than_2

        cmp     x5, #16

        mov     v3.16b, v1.16b
        sub     w12, w12, #1
        b.gt    .L128_dec_blocks_more_than_1

        sub     w12, w12, #1
        b       .L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_3:   //blocks        left >  3
        rev64   v4.16b, v5.16b                                    //GHASH final-3 block
        ld1     { v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        mov     d10, v17.d[1]                               //GHASH final-3 block - mid
        stp     x6, x7, [x2], #16        //AES final-3 block  - store result
        eor     v0.16b, v5.16b, v1.16b                            //AES final-2 block - result

        mov     d22, v4.d[1]                                 //GHASH final-3 block - mid
        mov     x7, v0.d[1]                            //AES final-2 block - mov high

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
        mov     x6, v0.d[0]                            //AES final-2 block - mov low

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid

        movi    v8.8b, #0                                        //suppress further partial tag feed in
        eor     x7, x7, x14                   //AES final-2 block - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        pmull   v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
        eor     x6, x6, x13                   //AES final-2 block - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
.L128_dec_blocks_more_than_2:   //blocks        left >  2

        rev64   v4.16b, v5.16b                                    //GHASH final-2 block
        ld1     { v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        eor     v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
        stp     x6, x7, [x2], #16        //AES final-2 block  - store result

        mov     d22, v4.d[1]                                 //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low

        pmull2  v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
        mov     x6, v0.d[0]                            //AES final-1 block - mov low

        mov     x7, v0.d[1]                            //AES final-1 block - mov high
        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid

        movi    v8.8b, #0                                        //suppress further partial tag feed in

        pmull   v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid

        eor     x6, x6, x13                   //AES final-1 block - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high

        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
        eor     x7, x7, x14                   //AES final-1 block - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
.L128_dec_blocks_more_than_1:   //blocks        left >  1

        rev64   v4.16b, v5.16b                                    //GHASH final-1 block

        ld1     { v5.16b}, [x0], #16                      //AES final block - load ciphertext
        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        mov     d22, v4.d[1]                                 //GHASH final-1 block - mid

        eor     v0.16b, v5.16b, v3.16b                            //AES final block - result

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid

        stp     x6, x7, [x2], #16        //AES final-1 block  - store result
        mov     x6, v0.d[0]                            //AES final block - mov low

        mov     x7, v0.d[1]                            //AES final block - mov high
        ins     v22.d[1], v22.d[0]                            //GHASH final-1 block - mid

        pmull   v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low

        pmull2  v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high

        pmull2  v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
        movi    v8.8b, #0                                        //suppress further partial tag feed in

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
        eor     x7, x7, x14                   //AES final block - round 10 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     x6, x6, x13                   //AES final block - round 10 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
.L128_dec_blocks_less_than_1:   //blocks        left <= 1

        mvn     x14, xzr                                      //rk10_h = 0xffffffffffffffff
        and     x1, x1, #127                    //bit_length %= 128

        mvn     x13, xzr                                      //rk10_l = 0xffffffffffffffff
        sub     x1, x1, #128                    //bit_length -= 128

        neg     x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])

        and     x1, x1, #127                    //bit_length %= 128

        lsr     x14, x14, x1                     //rk10_h is mask for top 64b of last block
        cmp     x1, #64

        csel    x10, x14, xzr, lt
        csel    x9, x13, x14, lt

        fmov    d0, x9                                   //ctr0b is mask for last block

        mov     v0.d[1], x10

        and     v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits

        rev64   v4.16b, v5.16b                                    //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        ldp     x4, x5, [x2] //load existing bytes we need to not overwrite

        and     x7, x7, x10

        pmull2  v20.1q, v4.2d, v12.2d                          //GHASH final block - high
        mov     d8, v4.d[1]                                  //GHASH final block - mid

        eor     v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
        eor     v9.16b, v9.16b, v20.16b                            //GHASH final block - high

        pmull   v8.1q, v8.1d, v16.1d                          //GHASH final block - mid

        pmull   v21.1q, v4.1d, v12.1d                          //GHASH final block - low
        bic     x4, x4, x9           //mask out low existing bytes
        and     x6, x6, x9

#ifndef __AARCH64EB__
        rev     w9, w12
#else
        mov     w9, w12
#endif

        eor     v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
        movi    v8.8b, #0xc2

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final block - low

        bic     x5, x5, x10   //mask out high existing bytes
        shl     d8, d8, #56               //mod_constant

        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid

        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        orr     x6, x6, x4
        str     w9, [x16, #12]                          //store the updated counter

        orr     x7, x7, x5
        stp     x6, x7, [x2]
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b
        mov     x0, x15
        st1     { v11.16b }, [x3]

        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

.L128_dec_ret:
        mov     w0, #0x0
        ret
.size   aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
.globl  aes_gcm_enc_192_kernel
.type   aes_gcm_enc_192_kernel,%function
.align  4
aes_gcm_enc_192_kernel:
        cbz     x1, .L192_enc_ret
        stp     x19, x20, [sp, #-112]!
        mov     x16, x4
        mov     x8, x5
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        ldp     x10, x11, [x16]             //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        ldp     x13, x14, [x8, #192]                     //load rk12
#ifdef __AARCH64EB__
        ror     x13, x13, #32
        ror     x14, x14, #32
#endif
        ld1     {v18.4s}, [x8], #16                                  //load rk0

        ld1     {v19.4s}, [x8], #16                                  //load rk1

        ld1     {v20.4s}, [x8], #16                                  //load rk2

        lsr     x12, x11, #32
        ld1     {v21.4s}, [x8], #16                                  //load rk3
        orr     w11, w11, w11

        ld1     {v22.4s}, [x8], #16                                  //load rk4
        rev     w12, w12                               //rev_ctr32

        add     w12, w12, #1                           //increment rev_ctr32
        fmov    d3, x10                              //CTR block 3

        rev     w9, w12                                //CTR block 1
        add     w12, w12, #1                           //CTR block 1
        fmov    d1, x10                              //CTR block 1

        orr     x9, x11, x9, lsl #32           //CTR block 1
        ld1     { v0.16b}, [x16]                            //special case vector load initial counter so we can start first AES block as quickly as possible

        fmov    v1.d[1], x9                              //CTR block 1
        rev     w9, w12                                //CTR block 2
        add     w12, w12, #1                           //CTR block 2

        fmov    d2, x10                              //CTR block 2
        orr     x9, x11, x9, lsl #32           //CTR block 2

        fmov    v2.d[1], x9                              //CTR block 2
        rev     w9, w12                                //CTR block 3

        orr     x9, x11, x9, lsl #32           //CTR block 3
        ld1     {v23.4s}, [x8], #16                                  //load rk5

        fmov    v3.d[1], x9                              //CTR block 3

        ld1     {v24.4s}, [x8], #16                                  //load rk6

        ld1     {v25.4s}, [x8], #16                                  //load rk7

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 0
        ld1     { v11.16b}, [x3]
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 0
        ld1     {v26.4s}, [x8], #16                                  //load rk8

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 0
        ldr     q15, [x3, #112]                       //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif
        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 0
        ld1     {v27.4s}, [x8], #16                                  //load rk9

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 1
        ld1     {v28.4s}, [x8], #16                              //load rk10

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 1
        ldr     q12, [x3, #32]                        //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif
        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 1
        ld1     {v29.4s}, [x8], #16                              //load rk11

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 1
        ldr     q14, [x3, #80]                        //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif
        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 2

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 2

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 3
        trn1    v9.2d, v14.2d,    v15.2d                     //h4h | h3h

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 3

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 2
        trn2    v17.2d,  v14.2d,    v15.2d                     //h4l | h3l

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 4

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 3

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 3

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 5

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 4

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 4

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 6

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 4

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 5

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 5

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 5

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 6
        ldr     q13, [x3, #64]                        //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif
        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 6

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 6

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 7

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 7
        trn2    v16.2d,  v12.2d,    v13.2d                     //h2l | h1l

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 7

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 8

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 7
        trn1    v8.2d,    v12.2d,    v13.2d                     //h2h | h1h

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 8

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 8

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 9

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 9

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 9

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 9

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b         //AES block 0 - round 10

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b         //AES block 2 - round 10

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b         //AES block 1 - round 10
        lsr     x5, x1, #3             //byte_len
        mov     x15, x5

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b         //AES block 3 - round 10
        sub     x5, x5, #1     //byte_len - 1

        eor     v16.16b, v16.16b, v8.16b                    //h2k | h1k
        and     x5, x5, #0xffffffffffffffc0   //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

        eor     v17.16b, v17.16b, v9.16b                 //h4k | h3k

        aese    v2.16b, v29.16b                                    //AES block 2 - round 11
        add     x4, x0, x1, lsr #3  //end_input_ptr
        add     x5, x5, x0

        aese    v1.16b, v29.16b                                    //AES block 1 - round 11
        cmp     x0, x5                  //check if we have <= 4 blocks

        aese    v0.16b, v29.16b                                    //AES block 0 - round 11
        add     w12, w12, #1                           //CTR block 3

        aese    v3.16b, v29.16b                                    //AES block 3 - round 11
        b.ge    .L192_enc_tail                                   //handle tail

        rev     w9, w12                                //CTR block 4
        ldp     x6, x7, [x0, #0]           //AES block 0 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        orr     x9, x11, x9, lsl #32           //CTR block 4
        ldp     x21, x22, [x0, #32]          //AES block 2 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        ldp     x23, x24, [x0, #48]          //AES block 3 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        ldp     x19, x20, [x0, #16]          //AES block 1 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        add     x0, x0, #64                      //AES input_ptr update
        cmp     x0, x5                  //check if we have <= 8 blocks

        eor     x6, x6, x13                    //AES block 0 - round 12 low

        eor     x7, x7, x14                    //AES block 0 - round 12 high
        eor     x22, x22, x14                    //AES block 2 - round 12 high
        fmov    d4, x6                              //AES block 0 - mov low

        eor     x24, x24, x14                    //AES block 3 - round 12 high
        fmov    v4.d[1], x7                          //AES block 0 - mov high

        eor     x21, x21, x13                    //AES block 2 - round 12 low
        eor     x19, x19, x13                    //AES block 1 - round 12 low

        fmov    d5, x19                              //AES block 1 - mov low
        eor     x20, x20, x14                    //AES block 1 - round 12 high

        fmov    v5.d[1], x20                          //AES block 1 - mov high

        eor     x23, x23, x13                    //AES block 3 - round 12 low
        fmov    d6, x21                              //AES block 2 - mov low

        add     w12, w12, #1                           //CTR block 4
        eor     v4.16b, v4.16b, v0.16b                         //AES block 0 - result
        fmov    d0, x10                              //CTR block 4

        fmov    v0.d[1], x9                              //CTR block 4
        rev     w9, w12                                //CTR block 5

        orr     x9, x11, x9, lsl #32           //CTR block 5
        add     w12, w12, #1                           //CTR block 5

        fmov    d7, x23                              //AES block 3 - mov low
        st1     { v4.16b}, [x2], #16                    //AES block 0 - store result

        fmov    v6.d[1], x22                          //AES block 2 - mov high

        eor     v5.16b, v5.16b, v1.16b                         //AES block 1 - result
        fmov    d1, x10                              //CTR block 5
        st1     { v5.16b}, [x2], #16                    //AES block 1 - store result

        fmov    v7.d[1], x24                          //AES block 3 - mov high

        fmov    v1.d[1], x9                              //CTR block 5
        rev     w9, w12                                //CTR block 6

        orr     x9, x11, x9, lsl #32           //CTR block 6

        add     w12, w12, #1                           //CTR block 6
        eor     v6.16b, v6.16b, v2.16b                         //AES block 2 - result
        fmov    d2, x10                              //CTR block 6

        fmov    v2.d[1], x9                              //CTR block 6
        rev     w9, w12                                //CTR block 7

        orr     x9, x11, x9, lsl #32           //CTR block 7
        st1     { v6.16b}, [x2], #16                    //AES block 2 - store result

        eor     v7.16b, v7.16b, v3.16b                         //AES block 3 - result
        st1     { v7.16b}, [x2], #16                    //AES block 3 - store result
        b.ge    .L192_enc_prepretail                             //do prepretail

.L192_enc_main_loop:    //main loop start
        //Each iteration encrypts 4 CTR blocks (4k+4..4k+7) through the AES-192
        //round sequence (rounds 0..11 plus the final round-12 key EOR done in
        //GPRs) while folding the previous iteration's 4 ciphertext blocks
        //(4k..4k+3, still in v4..v7) into the GHASH accumulator v11 using
        //Karatsuba multiplies and reduction by the reflected poly (0xc2 << 56).
        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 0
        rev64   v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 0
        ldp     x19, x20, [x0, #16]          //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        ext     v11.16b, v11.16b, v11.16b, #8                    //PRE 0
        fmov    d3, x10                              //CTR block 4k+3
        rev64   v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 1
        fmov    v3.d[1], x9                              //CTR block 4k+3

        pmull2  v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
        rev64   v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
        ldp     x21, x22, [x0, #32]          //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 0
        ldp     x23, x24, [x0, #48]          //AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        pmull   v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
        eor     v4.16b, v4.16b, v11.16b                          //PRE 1

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 1

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 1
        rev64   v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 0
        eor     x24, x24, x14                    //AES block 4k+7 - round 12 high

        pmull   v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
        mov     d8, v4.d[1]                                 //GHASH block 4k - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 2

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 1
        eor     x21, x21, x13                    //AES block 4k+6 - round 12 low

        eor     v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
        eor     v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 3
        eor     x19, x19, x13                    //AES block 4k+5 - round 12 low

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 2
        mov     d31, v6.d[1]                                 //GHASH block 4k+2 - mid

        pmull2  v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
        mov     d4, v5.d[1]                                 //GHASH block 4k+1 - mid

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 2

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 3

        mov     d10, v17.d[1]                              //GHASH block 4k - mid
        eor     v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 2
        eor     v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid

        pmull2  v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 4
        eor     v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 3

        pmull2  v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
        eor     x20, x20, x14                    //AES block 4k+5 - round 12 high
        ins     v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 5
        add     w12, w12, #1                           //CTR block 4k+3

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 4
        eor     v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high

        pmull   v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
        eor     x22, x22, x14                    //AES block 4k+6 - round 12 high

        pmull2  v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
        eor     x23, x23, x13                    //AES block 4k+7 - round 12 low
        mov     d30, v7.d[1]                                 //GHASH block 4k+3 - mid

        pmull   v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
        rev     w9, w12                                //CTR block 4k+8

        pmull   v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
        orr     x9, x11, x9, lsl #32           //CTR block 4k+8

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 3
        eor     v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 4
        ldp     x6, x7, [x0, #0]           //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 6
        eor     v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 4
        add     x0, x0, #64                      //AES input_ptr update

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 5
        movi    v8.8b, #0xc2

        pmull   v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
        eor     x7, x7, x14                    //AES block 4k+4 - round 12 high
        eor     v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 5
        eor     x6, x6, x13                    //AES block 4k+4 - round 12 low

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 6
        shl     d8, d8, #56              //mod_constant

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 5
        eor     v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 7
        fmov    d5, x19                              //AES block 4k+5 - mov low

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 7
        eor     v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 6
        fmov    v5.d[1], x20                          //AES block 4k+5 - mov high

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 8
        eor     v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low

        pmull   v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
        cmp     x0, x5                  //.LOOP CONTROL
        fmov    d4, x6                              //AES block 4k+4 - mov low

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 6
        fmov    v4.d[1], x7                          //AES block 4k+4 - mov high

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 8
        fmov    d7, x23                              //AES block 4k+7 - mov low

        eor     v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
        eor     v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
        add     w12, w12, #1                           //CTR block 4k+8

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 7
        fmov    v7.d[1], x24                          //AES block 4k+7 - mov high

        pmull   v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
        ext     v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
        fmov    d6, x21                              //AES block 4k+6 - mov low

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 7

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 9
        eor     v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 8

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 9

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 10
        eor     v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 9

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 9

        aese    v0.16b, v29.16b                                    //AES block 4k+4 - round 11

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 10
        eor     v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 10

        eor     v4.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
        fmov    d0, x10                              //CTR block 4k+8

        aese    v1.16b, v29.16b                                    //AES block 4k+5 - round 11
        fmov    v0.d[1], x9                              //CTR block 4k+8
        rev     w9, w12                                //CTR block 4k+9

        pmull   v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
        fmov    v6.d[1], x22                          //AES block 4k+6 - mov high
        st1     { v4.16b}, [x2], #16                    //AES block 4k+4 - store result

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 10
        orr     x9, x11, x9, lsl #32           //CTR block 4k+9

        eor     v5.16b, v5.16b, v1.16b                         //AES block 4k+5 - result
        add     w12, w12, #1                           //CTR block 4k+9
        fmov    d1, x10                              //CTR block 4k+9

        aese    v2.16b, v29.16b                                    //AES block 4k+6 - round 11
        fmov    v1.d[1], x9                              //CTR block 4k+9
        rev     w9, w12                                //CTR block 4k+10

        add     w12, w12, #1                           //CTR block 4k+10
        ext     v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
        orr     x9, x11, x9, lsl #32           //CTR block 4k+10

        st1     { v5.16b}, [x2], #16                    //AES block 4k+5 - store result
        eor     v11.16b, v11.16b, v9.16b                        //MODULO - fold into low

        aese    v3.16b, v29.16b                                    //AES block 4k+7 - round 11
        eor     v6.16b, v6.16b, v2.16b                         //AES block 4k+6 - result
        fmov    d2, x10                              //CTR block 4k+10

        st1     { v6.16b}, [x2], #16                    //AES block 4k+6 - store result
        fmov    v2.d[1], x9                              //CTR block 4k+10
        rev     w9, w12                                //CTR block 4k+11

        eor     v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
        orr     x9, x11, x9, lsl #32           //CTR block 4k+11

        eor     v7.16b, v7.16b, v3.16b                         //AES block 4k+7 - result
        st1     { v7.16b}, [x2], #16                    //AES block 4k+7 - store result
        b.lt    .L192_enc_main_loop

.L192_enc_prepretail:   //PREPRETAIL
        //Fold the last 4 full ciphertext blocks (still in v4..v7) into the
        //GHASH accumulator v11 and run the AES rounds for the next 4 counter
        //blocks, but load no plaintext and store no output here - the tail
        //code consumes the keystream left in v0..v3.
        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 0
        rev64   v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)

        fmov    d3, x10                              //CTR block 4k+3
        ext     v11.16b, v11.16b, v11.16b, #8                    //PRE 0
        add     w12, w12, #1                           //CTR block 4k+3

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 0
        rev64   v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 0

        fmov    v3.d[1], x9                              //CTR block 4k+3
        eor     v4.16b, v4.16b, v11.16b                          //PRE 1
        mov     d10, v17.d[1]                              //GHASH block 4k - mid

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 1
        rev64   v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)

        pmull2  v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high

        pmull   v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
        mov     d8, v4.d[1]                                 //GHASH block 4k - mid

        pmull   v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
        rev64   v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)

        pmull2  v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high

        eor     v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
        mov     d4, v5.d[1]                                 //GHASH block 4k+1 - mid

        eor     v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
        mov     d31, v6.d[1]                                 //GHASH block 4k+2 - mid

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 0
        eor     v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high

        pmull2  v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high

        eor     v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
        eor     v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 1

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 1
        eor     v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 1

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 2
        mov     d30, v7.d[1]                                 //GHASH block 4k+3 - mid

        pmull2  v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
        ins     v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 2

        pmull   v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
        eor     v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 3

        pmull2  v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid

        pmull   v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid

        pmull   v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
        eor     v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high

        pmull   v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 3
        eor     v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 2
        eor     v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 4

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 3
        eor     v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 3

        pmull   v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
        movi    v8.8b, #0xc2

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 4

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 4

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 4
        eor     v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 5

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 5
        eor     v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 5

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 6
        eor     v10.16b, v10.16b, v9.16b                        //MODULO - karatsuba tidy up

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 6
        shl     d8, d8, #56              //mod_constant

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 7

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 7
        eor     v10.16b, v10.16b, v11.16b                        //MODULO - karatsuba tidy up

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 7

        pmull   v30.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 6
        ext     v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 8

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 8
        eor     v10.16b, v10.16b, v30.16b                        //MODULO - fold into mid

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 7

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 8

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 9

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 8
        eor     v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 9

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 9

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 9

        pmull   v30.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low

        ext     v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b         //AES block 4k+7 - round 10

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b         //AES block 4k+4 - round 10

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b         //AES block 4k+6 - round 10

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b         //AES block 4k+5 - round 10
        eor     v11.16b, v11.16b, v30.16b                        //MODULO - fold into low

        aese    v0.16b, v29.16b                                    //AES block 4k+4 - round 11

        aese    v3.16b, v29.16b                                    //AES block 4k+7 - round 11

        aese    v2.16b, v29.16b                                    //AES block 4k+6 - round 11

        aese    v1.16b, v29.16b                                    //AES block 4k+5 - round 11
        eor     v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
.L192_enc_tail: //TAIL
        //Encrypt the remaining 1..4 blocks using the keystream already held
        //in v0..v3; dispatch on bytes left so the FINAL block's keystream
        //always ends up in v3 (the last block may be partial).

        sub     x5, x4, x0  //main_end_input_ptr is number of bytes left to process
        ldp     x6, x7, [x0], #16          //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     x6, x6, x13                    //AES block 4k+4 - round 12 low
        eor     x7, x7, x14                    //AES block 4k+4 - round 12 high

        fmov    d4, x6                              //AES block 4k+4 - mov low

        fmov    v4.d[1], x7                          //AES block 4k+4 - mov high
        cmp     x5, #48

        eor     v5.16b, v4.16b, v0.16b                         //AES block 4k+4 - result

        ext     v8.16b, v11.16b, v11.16b, #8                    //prepare final partial tag
        b.gt    .L192_enc_blocks_more_than_3

        //<= 3 blocks: rewind the over-advanced counter and shift the
        //keystream down so the last block's keystream lands in v3
        sub     w12, w12, #1
        movi    v10.8b, #0                              //zero GHASH mid accumulator

        mov     v3.16b, v2.16b
        movi    v9.8b, #0                              //zero GHASH high accumulator
        cmp     x5, #32

        mov     v2.16b, v1.16b
        movi    v11.8b, #0                              //zero GHASH low accumulator
        b.gt    .L192_enc_blocks_more_than_2

        sub     w12, w12, #1                            //rewind counter again: <= 2 blocks

        mov     v3.16b, v1.16b
        cmp     x5, #16
        b.gt    .L192_enc_blocks_more_than_1

        sub     w12, w12, #1                            //rewind counter again: single block
        b       .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3:   //blocks        left >  3
        //Store ciphertext block final-3, fold it into GHASH, and encrypt
        //block final-2 with keystream v1.
        st1     { v5.16b}, [x2], #16                    //AES final-3 block  - store result

        ldp     x6, x7, [x0], #16          //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        rev64   v4.16b, v5.16b                                   //GHASH final-3 block

        eor     x6, x6, x13                    //AES final-2 block - round 12 low
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        eor     x7, x7, x14                    //AES final-2 block - round 12 high
        fmov    d5, x6                                //AES final-2 block - mov low

        fmov    v5.d[1], x7                            //AES final-2 block - mov high

        mov     d22, v4.d[1]                                //GHASH final-3 block - mid

        pmull   v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low

        mov     d10, v17.d[1]                              //GHASH final-3 block - mid

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid

        movi    v8.8b, #0                                       //suppress further partial tag feed in

        pmull2  v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high

        pmull   v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
        eor     v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
.L192_enc_blocks_more_than_2:   //blocks        left >  2
        //Store ciphertext block final-2, fold it into GHASH, and encrypt
        //block final-1 with keystream v2.

        st1     { v5.16b}, [x2], #16                    //AES final-2 block - store result

        rev64   v4.16b, v5.16b                                   //GHASH final-2 block
        ldp     x6, x7, [x0], #16          //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        eor     x7, x7, x14                    //AES final-1 block - round 12 high

        pmull2  v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
        mov     d22, v4.d[1]                                //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
        eor     x6, x6, x13                    //AES final-1 block - round 12 low

        fmov    d5, x6                                //AES final-1 block - mov low

        fmov    v5.d[1], x7                            //AES final-1 block - mov high
        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low

        pmull   v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid

        movi    v8.8b, #0                                       //suppress further partial tag feed in

        eor     v5.16b, v5.16b, v2.16b                           //AES final-1 block - result

        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
.L192_enc_blocks_more_than_1:   //blocks        left >  1
        //Store ciphertext block final-1, fold it into GHASH, and encrypt the
        //final (possibly partial) block with keystream v3.

        st1     { v5.16b}, [x2], #16                    //AES final-1 block - store result

        ldp     x6, x7, [x0], #16          //AES final block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        rev64   v4.16b, v5.16b                                   //GHASH final-1 block

        eor     x6, x6, x13                    //AES final block - round 12 low
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag
        movi    v8.8b, #0                                       //suppress further partial tag feed in

        mov     d22, v4.d[1]                                //GHASH final-1 block - mid

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
        eor     x7, x7, x14                    //AES final block - round 12 high
        fmov    d5, x6                                //AES final block - mov low

        pmull2  v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
        fmov    v5.d[1], x7                            //AES final block - mov high

        ins     v22.d[1], v22.d[0]                           //GHASH final-1 block - mid

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high

        pmull   v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low

        pmull2  v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid

        eor     v5.16b, v5.16b, v3.16b                           //AES final block - result

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low

        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
.L192_enc_blocks_less_than_1:   //blocks        left <= 1
        //--------------------------------------------------------------
        // Final (possibly partial) block. Build a byte mask selecting
        // only the valid low-order bytes of the last block, merge the
        // masked result with the bytes already present at the output,
        // fold the masked block into GHASH via v12 (h1l|h1h), perform
        // the final modulo reduction with the GCM polynomial constant
        // (0xc2 << 56), then store the updated counter and tag and
        // restore callee-saved registers (AAPCS64: x19-x24, d8-d15).
        //--------------------------------------------------------------

        ld1     { v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
#ifndef __AARCH64EB__
        rev     w9, w12
#else
        mov     w9, w12
#endif
        and     x1, x1, #127                   //bit_length %= 128

        sub     x1, x1, #128                   //bit_length -= 128
        mvn     x14, xzr                                     //rk12_h = 0xffffffffffffffff

        neg     x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
        mvn     x13, xzr                                     //rk12_l = 0xffffffffffffffff

        // 128 becomes 0 here, so a full last block shifts by 0 and
        // keeps the all-ones masks
        and     x1, x1, #127                   //bit_length %= 128

        lsr     x14, x14, x1                    //rk12_h is mask for top 64b of last block
        cmp     x1, #64

        // <64 missing bits: low half all-ones, high half partial mask;
        // otherwise the partial mask moves to the low half, high = 0
        csel    x6, x13, x14, lt
        csel    x7, x14, xzr, lt

        fmov    d0, x6                                //ctr0b is mask for last block

        fmov    v0.d[1], x7

        and     v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits

        rev64   v4.16b, v5.16b                                   //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        mov     d8, v4.d[1]                                 //GHASH final block - mid

        pmull   v21.1q, v4.1d, v12.1d                         //GHASH final block - low

        pmull2  v20.1q, v4.2d, v12.2d                         //GHASH final block - high

        eor     v8.8b, v8.8b, v4.8b                         //GHASH final block - mid

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final block - low

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final block - high

        pmull   v8.1q, v8.1d, v16.1d                         //GHASH final block - mid

        eor     v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
        movi    v8.8b, #0xc2

        eor     v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up

        shl     d8, d8, #56              //mod_constant

        bif     v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing

        eor     v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid

        ext     v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment

        eor     v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid

        pmull   v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low

        ext     v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment

        eor     v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
        str     w9, [x16, #12]                         //store the updated counter

        st1     { v5.16b}, [x2]                         //store all 16B

        eor     v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b
        // return value: byte length processed (x15 saved at entry)
        mov     x0, x15
        st1     { v11.16b }, [x3]

        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

.L192_enc_ret:
        // zero-length input: return 0 without touching any state
        mov     w0, #0x0
        ret
.size   aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
//----------------------------------------------------------------------
// aes_gcm_dec_192_kernel: AES-192-GCM decryption kernel.
// Register arguments (as used by the visible code):
//   x0 = input (ciphertext) pointer
//   x1 = input length in BITS (returns immediately if zero)
//   x2 = output (plaintext) pointer
//   x3 = GHASH state: current Xi at [x3], key powers h1..h4 at
//        offsets #32/#64/#80/#112 (loaded below)
//   x4 = 16-byte counter block (copied to x16)
//   x5 = AES-192 round keys rk0..rk12 (copied to x8)
// Returns: x0 = number of bytes processed (byte_len, saved in x15).
// Callee-saved state per AAPCS64 (x19-x24, d8-d15) is spilled to a
// 112-byte frame. NOTE(review): the function continues past this
// region; the tail (.L192_dec_tail) is outside this view.
//----------------------------------------------------------------------
.globl  aes_gcm_dec_192_kernel
.type   aes_gcm_dec_192_kernel,%function
.align  4
aes_gcm_dec_192_kernel:
        cbz     x1, .L192_dec_ret
        stp     x19, x20, [sp, #-112]!
        mov     x16, x4
        mov     x8, x5
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        add     x4, x0, x1, lsr #3   //end_input_ptr
        ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        // rk12 is kept in GPRs so the last AES round (AddRoundKey) can
        // be fused into the scalar XOR of each block
        ldp     x13, x14, [x8, #192]                     //load rk12
#ifdef __AARCH64EB__
        ror     x13, x13, #32
        ror     x14, x14, #32
#endif
        ld1     { v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible

        ld1     {v18.4s}, [x8], #16                                  //load rk0

        lsr     x5, x1, #3              //byte_len
        mov     x15, x5
        ld1     {v19.4s}, [x8], #16                               //load rk1

        lsr     x12, x11, #32
        // writing the w-register zeroes the top 32 bits of x11,
        // leaving only the low word of the counter's top half
        orr     w11, w11, w11
        fmov    d3, x10                               //CTR block 3

        rev     w12, w12                                //rev_ctr32
        fmov    d1, x10                               //CTR block 1

        add     w12, w12, #1                            //increment rev_ctr32
        ld1     {v20.4s}, [x8], #16                               //load rk2

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 0
        rev     w9, w12                                 //CTR block 1

        add     w12, w12, #1                            //CTR block 1
        orr     x9, x11, x9, lsl #32            //CTR block 1
        ld1     {v21.4s}, [x8], #16                               //load rk3

        fmov    v1.d[1], x9                               //CTR block 1
        rev     w9, w12                                 //CTR block 2
        add     w12, w12, #1                            //CTR block 2

        fmov    d2, x10                               //CTR block 2
        orr     x9, x11, x9, lsl #32            //CTR block 2

        fmov    v2.d[1], x9                               //CTR block 2
        rev     w9, w12                                 //CTR block 3

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 1
        orr     x9, x11, x9, lsl #32            //CTR block 3

        fmov    v3.d[1], x9                               //CTR block 3

        //--------------------------------------------------------------
        // Run AES rounds 0-11 on counter blocks 0-3 (v0-v3), software-
        // pipelined with loads of round keys rk4-rk11 (v22-v29) and the
        // GHASH key powers h1-h4 (v12-v15). The trn1/trn2 + eor steps
        // precompute the karatsuba "mid" key pairs (v16, v17). Also
        // computes the main-loop bound in x5 (a multiple of 64 bytes;
        // at least one byte is always left for the tail path).
        //--------------------------------------------------------------
        ld1     {v22.4s}, [x8], #16                               //load rk4

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 2

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 0
        ld1     {v23.4s}, [x8], #16                               //load rk5

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 0
        ldr     q15, [x3, #112]                        //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif
        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 0
        ldr     q13, [x3, #64]                         //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif
        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 1
        ldr     q14, [x3, #80]                         //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif
        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 1

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 1
        ldr     q12, [x3, #32]                         //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif
        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 2
        ld1     {v24.4s}, [x8], #16                               //load rk6

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 3
        ld1     {v25.4s}, [x8], #16                               //load rk7

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 2
        ld1     {v26.4s}, [x8], #16                               //load rk8

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 2
        ld1     {v27.4s}, [x8], #16                               //load rk9

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 3
        // load current Xi and convert to the internal bit order
        ld1     { v11.16b}, [x3]
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 3
        add     w12, w12, #1                            //CTR block 3

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 3
        trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 4
        ld1     {v28.4s}, [x8], #16                              //load rk10

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 4
        trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 4

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 4
        trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 5
        ld1     {v29.4s}, [x8], #16                              //load rk11

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 5

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 5

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 6

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 6

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 6

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 7

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 7

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 6

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 8

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 7

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 9

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 9

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 8
        sub     x5, x5, #1      //byte_len - 1

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 8
        and     x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 10
        add     x5, x5, x0

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 9
        cmp     x0, x5                   //check if we have <= 4 blocks

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 9
        trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h

        // round 11 has no MixColumns (aese only): output is the final
        // keystream once rk12 is XORed in (done with x13/x14 later)
        aese    v3.16b, v29.16b                                     //AES block 3 - round 11

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 10

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 10

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 10
        eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k

        aese    v2.16b, v29.16b                                     //AES block 2 - round 11

        aese    v1.16b, v29.16b                                     //AES block 1 - round 11
        eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k

        aese    v0.16b, v29.16b                                     //AES block 0 - round 11
        b.ge    .L192_dec_tail                                    //handle tail

        //--------------------------------------------------------------
        // More than 4 blocks: load the first four ciphertext blocks,
        // XOR with the keystream to get plaintext, store blocks 0-1,
        // and set up counter blocks 4-6 for the main loop. Note that
        // for DECRYPTION GHASH is computed over the loaded ciphertext
        // (rev64 of v4/v5 below), not over the XORed result.
        //--------------------------------------------------------------
        ld1     {v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext

        eor     v1.16b, v5.16b, v1.16b                            //AES block 1 - result

        eor     v0.16b, v4.16b, v0.16b                            //AES block 0 - result
        rev     w9, w12                                 //CTR block 4
        ld1     {v6.16b, v7.16b}, [x0], #32               //AES block 2,3 - load ciphertext

        mov     x19, v1.d[0]                            //AES block 1 - mov low

        mov     x20, v1.d[1]                            //AES block 1 - mov high

        mov     x6, v0.d[0]                            //AES block 0 - mov low
        orr     x9, x11, x9, lsl #32            //CTR block 4
        add     w12, w12, #1                            //CTR block 4

        mov     x7, v0.d[1]                            //AES block 0 - mov high
        rev64   v4.16b, v4.16b                                    //GHASH block 0

        fmov    d0, x10                               //CTR block 4
        rev64   v5.16b, v5.16b                                    //GHASH block 1
        cmp     x0, x5                   //check if we have <= 8 blocks

        // the final AddRoundKey (rk12) is applied in scalar registers,
        // fused with extracting the result for the store
        eor     x19, x19, x13                   //AES block 1 - round 12 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        fmov    v0.d[1], x9                               //CTR block 4
        rev     w9, w12                                 //CTR block 5

        orr     x9, x11, x9, lsl #32            //CTR block 5
        fmov    d1, x10                               //CTR block 5
        eor     x20, x20, x14                   //AES block 1 - round 12 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        add     w12, w12, #1                            //CTR block 5
        fmov    v1.d[1], x9                               //CTR block 5
        eor     x6, x6, x13                   //AES block 0 - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        rev     w9, w12                                 //CTR block 6
        eor     x7, x7, x14                   //AES block 0 - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        stp     x6, x7, [x2], #16        //AES block 0 - store result
        orr     x9, x11, x9, lsl #32            //CTR block 6

        stp     x19, x20, [x2], #16        //AES block 1 - store result

        add     w12, w12, #1                            //CTR block 6
        eor     v2.16b, v6.16b, v2.16b                            //AES block 2 - result
        b.ge    .L192_dec_prepretail                              //do prepretail

.L192_dec_main_loop:    //main  loop start
        //--------------------------------------------------------------
        // Steady state: each iteration decrypts 4 blocks. It GHASHes
        // the previous 4 ciphertext blocks (karatsuba per block: low
        // into v11, high into v9, mid into v10, using key powers
        // v15=h4 .. v12=h1), performs the polynomial modulo reduction
        // (0xc2 << 56 constant in v8), and interleaves AES rounds 0-11
        // for the next 4 counter blocks with loading, XORing, and
        // storing the next 4 ciphertext/plaintext blocks.
        //--------------------------------------------------------------
        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0

        pmull   v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low

        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        fmov    d2, x10                               //CTR block 4k+6

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        // fold the accumulated tag into the first GHASH block
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1

        pmull2  v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
        fmov    v2.d[1], x9                               //CTR block 4k+6

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        fmov    d3, x10                               //CTR block 4k+7
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d10, v17.d[1]                               //GHASH block 4k - mid
        rev     w9, w12                                 //CTR block 4k+7

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        orr     x9, x11, x9, lsl #32            //CTR block 4k+7

        fmov    v3.d[1], x9                               //CTR block 4k+7
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
        eor     x22, x22, x14                   //AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
        eor     v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
        eor     x21, x21, x13                   //AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif
        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3

        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        eor     v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4

        pmull2  v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid

        pmull   v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5

        eor     v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3

        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        movi    v8.8b, #0xc2

        pmull   v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9
        eor     v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9
        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        shl     d8, d8, #56               //mod_constant

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        ld1     {v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        ld1     {v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
        eor     x23, x23, x13                   //AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v0.16b, v29.16b                                     //AES block 4k+4 - round 11
        add     w12, w12, #1                            //CTR block 4k+7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        ld1     {v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext

        aese    v1.16b, v29.16b                                     //AES block 4k+5 - round 11
        ld1     {v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
        rev     w9, w12                                 //CTR block 4k+8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        cmp     x0, x5                   //.LOOP CONTROL

        eor     v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
        eor     x24, x24, x14                   //AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif
        eor     v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10
        orr     x9, x11, x9, lsl #32            //CTR block 4k+8

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        mov     x19, v1.d[0]                            //AES block 4k+5 - mov low

        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result
        rev64   v5.16b, v5.16b                                    //GHASH block 4k+5

        aese    v2.16b, v29.16b                                     //AES block 4k+6 - round 11
        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10
        mov     x20, v1.d[1]                            //AES block 4k+5 - mov high

        fmov    d0, x10                               //CTR block 4k+8
        add     w12, w12, #1                            //CTR block 4k+8
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        eor     v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
        fmov    v0.d[1], x9                               //CTR block 4k+8
        rev     w9, w12                                 //CTR block 4k+9

        eor     x6, x6, x13                   //AES block 4k+4 - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        orr     x9, x11, x9, lsl #32            //CTR block 4k+9
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        fmov    d1, x10                               //CTR block 4k+9
        add     w12, w12, #1                            //CTR block 4k+9
        eor     x19, x19, x13                   //AES block 4k+5 - round 12 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        fmov    v1.d[1], x9                               //CTR block 4k+9
        rev     w9, w12                                 //CTR block 4k+10
        eor     x20, x20, x14                   //AES block 4k+5 - round 12 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        eor     x7, x7, x14                   //AES block 4k+4 - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        stp     x6, x7, [x2], #16        //AES block 4k+4 - store result
        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low

        add     w12, w12, #1                            //CTR block 4k+10
        rev64   v4.16b, v4.16b                                    //GHASH block 4k+4
        orr     x9, x11, x9, lsl #32            //CTR block 4k+10

        aese    v3.16b, v29.16b                                     //AES block 4k+7 - round 11
        stp     x19, x20, [x2], #16        //AES block 4k+5 - store result
        b.lt    .L192_dec_main_loop

.L192_dec_prepretail:   //PREPRETAIL
        //------------------------------------------------------------------
        // Last full 4-block iteration before the tail: GHASH the final four
        // whole ciphertext blocks (v4..v7, already loaded and rev64'd as
        // needed) using hash powers h4..h1 (v15..v12) with karatsuba
        // low/mid/high accumulation into v11/v10/v9, run the 13-round AES
        // keystream for counter blocks 4k+4..4k+7 (v0..v3), and perform the
        // full GF(2^128) modular reduction (0xc2 constant in v8) so v11
        // holds the folded GHASH state on entry to .L192_dec_tail.
        // Also stores the last two plaintext results of the previous
        // iteration (blocks 4k+2 / 4k+3).
        //------------------------------------------------------------------
        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        eor     v4.16b, v4.16b, v11.16b                           //PRE 1: feed current tag into first GHASH block
        fmov    d2, x10                               //CTR block 4k+6

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid
        fmov    d3, x10                               //CTR block 4k+7

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        fmov    v2.d[1], x9                               //CTR block 4k+6
        rev     w9, w12                                 //CTR block 4k+7

        orr     x9, x11, x9, lsl #32            //CTR block 4k+7
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        pmull   v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        eor     x24, x24, x14                   //AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif
        fmov    v3.d[1], x9                               //CTR block 4k+7

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
        eor     x21, x21, x13                   //AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif
        pmull2  v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
        eor     x22, x22, x14                   //AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        eor     x23, x23, x13                   //AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        eor     v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
        add     w12, w12, #1                            //CTR block 4k+7

        pmull2  v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        eor     v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0

        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
        mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        eor     v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high

        eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid

        pmull   v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid

        pmull   v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3

        pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
        movi    v8.8b, #0xc2                            //GCM reduction polynomial byte

        pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        shl     d8, d8, #56               //mod_constant
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        eor     v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10

        aese    v0.16b, v29.16b                 //AES block 4k+4 - round 11 (final aese, no aesmc)
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        aese    v2.16b, v29.16b                 //AES block 4k+6 - round 11

        aese    v1.16b, v29.16b                 //AES block 4k+5 - round 11

        aese    v3.16b, v29.16b                 //AES block 4k+7 - round 11

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
.L192_dec_tail: //TAIL
        //------------------------------------------------------------------
        // Tail dispatcher: between 1 and 4 blocks (possibly the last one
        // partial) remain. x5 = bytes left. Decrypt the first remaining
        // block with keystream v0, then shuffle the pre-computed keystream
        // blocks (v1..v3) down so each "more_than_N" path always consumes
        // the keystream for its position, decrementing the counter w12 for
        // each unused keystream block.
        //------------------------------------------------------------------

        sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
        ld1     { v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext

        eor     v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result

        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high

        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low

        ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag

        cmp     x5, #48

        eor     x7, x7, x14                   //AES block 4k+4 - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     x6, x6, x13                   //AES block 4k+4 - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        b.gt    .L192_dec_blocks_more_than_3

        movi    v11.8b, #0                      //reset GHASH low accumulator
        movi    v9.8b, #0                       //reset GHASH high accumulator

        mov     v3.16b, v2.16b                  //shift keystream down: <=3 blocks left
        mov     v2.16b, v1.16b
        sub     w12, w12, #1                    //one keystream block unused

        movi    v10.8b, #0                      //reset GHASH mid accumulator
        cmp     x5, #32
        b.gt    .L192_dec_blocks_more_than_2

        mov     v3.16b, v1.16b                  //shift keystream down: <=2 blocks left
        cmp     x5, #16
        sub     w12, w12, #1                    //another keystream block unused

        b.gt    .L192_dec_blocks_more_than_1

        sub     w12, w12, #1                    //only the (possibly partial) final block left
        b       .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3:   //blocks        left >  3
        //------------------------------------------------------------------
        // Four blocks remain: GHASH the final-3 block with h4 (v15),
        // feeding in the running tag (v8) exactly once, store its
        // plaintext, and decrypt the final-2 block with keystream v1.
        //------------------------------------------------------------------
        rev64   v4.16b, v5.16b                                    //GHASH final-3 block
        ld1     { v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext

        stp     x6, x7, [x2], #16        //AES final-3 block  - store result

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        eor     v0.16b, v5.16b, v1.16b                            //AES final-2 block - result

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
        mov     x6, v0.d[0]                            //AES final-2 block - mov low
        mov     d22, v4.d[1]                                 //GHASH final-3 block - mid

        mov     x7, v0.d[1]                            //AES final-2 block - mov high

        mov     d10, v17.d[1]                               //GHASH final-3 block - mid
        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high

        eor     x6, x6, x13                   //AES final-2 block - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        movi    v8.8b, #0                                        //suppress further partial tag feed in

        pmull   v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
        eor     x7, x7, x14                   //AES final-2 block - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
.L192_dec_blocks_more_than_2:   //blocks        left >  2
        //------------------------------------------------------------------
        // GHASH the final-2 block with h3 (v14), accumulating into
        // v11/v10/v9, store its plaintext, and decrypt the final-1 block
        // with keystream v2.
        //------------------------------------------------------------------

        rev64   v4.16b, v5.16b                                    //GHASH final-2 block
        ld1     { v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        movi    v8.8b, #0                                        //suppress further partial tag feed in

        eor     v0.16b, v5.16b, v2.16b                            //AES final-1 block - result

        mov     d22, v4.d[1]                                 //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low

        stp     x6, x7, [x2], #16        //AES final-2 block  - store result

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
        mov     x7, v0.d[1]                            //AES final-1 block - mov high

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
        mov     x6, v0.d[0]                            //AES final-1 block - mov low

        pmull2  v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high

        pmull   v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
        eor     x7, x7, x14                   //AES final-1 block - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     x6, x6, x13                   //AES final-1 block - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
.L192_dec_blocks_more_than_1:   //blocks        left >  1
        //------------------------------------------------------------------
        // GHASH the final-1 block with h2 (v13), store its plaintext, and
        // decrypt the final (possibly partial) block with keystream v3.
        //------------------------------------------------------------------

        rev64   v4.16b, v5.16b                                    //GHASH final-1 block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag
        ld1     { v5.16b}, [x0], #16                      //AES final block - load ciphertext

        mov     d22, v4.d[1]                                 //GHASH final-1 block - mid

        pmull2  v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high

        eor     v0.16b, v5.16b, v3.16b                            //AES final block - result
        stp     x6, x7, [x2], #16        //AES final-1 block  - store result

        eor     v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high

        pmull   v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
        mov     x7, v0.d[1]                            //AES final block - mov high

        ins     v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
        mov     x6, v0.d[0]                            //AES final block - mov low

        pmull2  v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid

        movi    v8.8b, #0                                        //suppress further partial tag feed in
        eor     v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
        eor     x7, x7, x14                   //AES final block - round 12 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     x6, x6, x13                   //AES final block - round 12 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
.L192_dec_blocks_less_than_1:   //blocks        left <= 1
        //------------------------------------------------------------------
        // Final (possibly partial) block. Build 64-bit byte masks from
        // bit_length % 128 so only the valid low bytes of the decrypted
        // block are written: existing output bytes beyond the message end
        // (preloaded from [x2] into x4/x5) are preserved. The masked
        // ciphertext is then GHASHed with h1 (v12), the full modular
        // reduction is applied, the updated counter is stored back to the
        // IV block and the final tag state is written to [x3] before the
        // callee-saved register restore and return.
        //------------------------------------------------------------------

        mvn     x13, xzr                                      //rk12_l = 0xffffffffffffffff
        ldp     x4, x5, [x2]  //load existing bytes we need to not overwrite
        and     x1, x1, #127                    //bit_length %= 128

        sub     x1, x1, #128                    //bit_length -= 128

        neg     x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])

        and     x1, x1, #127                    //bit_length %= 128
        mvn     x14, xzr                                      //rk12_h = 0xffffffffffffffff

        lsr     x14, x14, x1                     //rk12_h is mask for top 64b of last block
        cmp     x1, #64

        csel    x9, x13, x14, lt                //low-half mask: all-ones if >64 valid bits, else shifted mask
        csel    x10, x14, xzr, lt               //high-half mask: shifted mask if >64 valid bits, else zero

        fmov    d0, x9                                   //ctr0b is mask for last block
        and     x6, x6, x9                      //keep only valid decrypted low bytes
        bic     x4, x4, x9           //mask out low existing bytes

        orr     x6, x6, x4                      //merge decrypted bytes with preserved bytes
        mov     v0.d[1], x10
#ifndef __AARCH64EB__
        rev     w9, w12
#else
        mov     w9, w12
#endif

        and     v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
        str     w9, [x16, #12]                          //store the updated counter

        rev64   v4.16b, v5.16b                                    //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag
        bic     x5, x5, x10 //mask out high existing bytes

        and     x7, x7, x10                     //keep only valid decrypted high bytes

        pmull2  v20.1q, v4.2d, v12.2d                          //GHASH final block - high
        mov     d8, v4.d[1]                                  //GHASH final block - mid

        pmull   v21.1q, v4.1d, v12.1d                          //GHASH final block - low

        eor     v8.8b, v8.8b, v4.8b                          //GHASH final block - mid

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final block - high

        pmull   v8.1q, v8.1d, v16.1d                          //GHASH final block - mid

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final block - low

        eor     v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
        movi    v8.8b, #0xc2                            //GCM reduction polynomial byte

        eor     v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        shl     d8, d8, #56               //mod_constant

        eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up

        pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        orr     x7, x7, x5                      //merge decrypted bytes with preserved bytes
        stp     x6, x7, [x2]                    //store final (masked) plaintext block

        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low

        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8           //undo the working-byte-order swap
        rev64   v11.16b, v11.16b
        mov     x0, x15                                 //return byte_len processed
        st1     { v11.16b }, [x3]                       //store updated GHASH tag state

        //restore callee-saved registers (AAPCS64: x19-x24, d8-d15)
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

.L192_dec_ret:
        //zero-length input: nothing processed, return 0
        mov     w0, #0x0
        ret
.size   aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
//----------------------------------------------------------------------
// aes_gcm_enc_256_kernel: AES-256-GCM encryption kernel (prologue and
// first-4-blocks setup; the function continues beyond this excerpt).
// In:  x0 = input ptr, x1 = bit length, x2 = output ptr, x3 = Xi/Htable,
//      x4 = counter/IV block, x5 = AES-256 key schedule (rk0..rk14).
// Saves callee-saved x19-x24 and d8-d15 (AAPCS64), loads the 15 round
// keys into v18-v31/x13-x14, the hash powers h1..h4 into v12..v15, and
// begins the 14-round AES pipeline on counter blocks 0-3 (v0-v3).
//----------------------------------------------------------------------
.globl  aes_gcm_enc_256_kernel
.type   aes_gcm_enc_256_kernel,%function
.align  4
aes_gcm_enc_256_kernel:
        cbz     x1, .L256_enc_ret               //zero-length input: nothing to do
        stp     x19, x20, [sp, #-112]!          //reserve frame, save x19/x20
        mov     x16, x4                         //x16 = counter block ptr
        mov     x8, x5                          //x8 = key schedule ptr
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        add     x4, x0, x1, lsr #3   //end_input_ptr
        lsr     x5, x1, #3              //byte_len
        mov     x15, x5
        ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        ldp     x13, x14, [x8, #224]                     //load rk14
#ifdef __AARCH64EB__
        ror     x13, x13, #32
        ror     x14, x14, #32
#endif
        ld1     { v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
        sub     x5, x5, #1      //byte_len - 1

        ld1     {v18.4s}, [x8], #16                               //load rk0
        and     x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

        ld1     {v19.4s}, [x8], #16                               //load rk1
        add     x5, x5, x0                      //x5 = main-loop end pointer

        lsr     x12, x11, #32
        fmov    d2, x10                               //CTR block 2
        orr     w11, w11, w11                   //zero-extend ctr96_t32 low word

        rev     w12, w12                                //rev_ctr32
        cmp     x0, x5                   //check if we have <= 4 blocks
        fmov    d1, x10                               //CTR block 1

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 0
        add     w12, w12, #1                            //increment rev_ctr32

        rev     w9, w12                                 //CTR block 1
        fmov    d3, x10                               //CTR block 3

        orr     x9, x11, x9, lsl #32            //CTR block 1
        add     w12, w12, #1                            //CTR block 1
        ld1     {v20.4s}, [x8], #16                               //load rk2

        fmov    v1.d[1], x9                               //CTR block 1
        rev     w9, w12                                 //CTR block 2
        add     w12, w12, #1                            //CTR block 2

        orr     x9, x11, x9, lsl #32            //CTR block 2
        ld1     {v21.4s}, [x8], #16                               //load rk3

        fmov    v2.d[1], x9                               //CTR block 2
        rev     w9, w12                                 //CTR block 3

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 1
        orr     x9, x11, x9, lsl #32            //CTR block 3

        fmov    v3.d[1], x9                               //CTR block 3

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 0
        ld1     {v22.4s}, [x8], #16                               //load rk4

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 2
        ld1     {v23.4s}, [x8], #16                               //load rk5

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 0
        ld1     {v24.4s}, [x8], #16                               //load rk6

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 1
        ldr     q14, [x3, #80]                         //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif
        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 0
        ld1     {v25.4s}, [x8], #16                               //load rk7

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 1
        ld1     {v26.4s}, [x8], #16                               //load rk8

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 2
        ldr     q13, [x3, #64]                         //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif
        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 1
        ld1     {v27.4s}, [x8], #16                               //load rk9

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 2
        ldr     q15, [x3, #112]                        //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif
        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 3
        ld1     {v28.4s}, [x8], #16                              //load rk10

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 2
        ld1     {v29.4s}, [x8], #16                              //load rk11

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 3
        add     w12, w12, #1                            //CTR block 3

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 3

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 3
        ld1     { v11.16b}, [x3]                //load current GHASH tag state
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 4

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 4

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 4

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 4

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 5

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 5

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 5

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 6
        trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 6
        ld1     {v30.4s}, [x8], #16                              //load rk12

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 6
        ldr     q12, [x3, #32]                         //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif
        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 6
        ld1     {v31.4s}, [x8], #16                              //load rk13

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 7
        trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 7

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 7
        trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 8

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 8

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 8

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 9

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 9

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 8

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 10

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 9

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 9

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 10

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 10

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 11

        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 11

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 10

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 12

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 12

        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 11
        eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 11

        aese    v2.16b, v31.16b                                     //AES block 2 - round 13
        trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 12

        aese    v3.16b, v30.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 12

        aese    v1.16b, v31.16b                                     //AES block 1 - round 13

        aese    v0.16b, v31.16b                                     //AES block 0 - round 13

        aese    v3.16b, v31.16b                                     //AES block 3 - round 13
        eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k
        b.ge    .L256_enc_tail                                    //handle tail

        //main path: load the first four plaintext blocks into GPR pairs,
        //apply the final round key (rk14) there, and move them into v4-v7
        ldp     x19, x20, [x0, #16]           //AES block 1 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        rev     w9, w12                                 //CTR block 4
        ldp     x6, x7, [x0, #0]            //AES block 0 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        ldp     x23, x24, [x0, #48]           //AES block 3 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        ldp     x21, x22, [x0, #32]           //AES block 2 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        add     x0, x0, #64                       //AES input_ptr update

        eor     x19, x19, x13                     //AES block 1 - round 14 low
        eor     x20, x20, x14                     //AES block 1 - round 14 high

        fmov    d5, x19                               //AES block 1 - mov low
        eor     x6, x6, x13                     //AES block 0 - round 14 low

        eor     x7, x7, x14                     //AES block 0 - round 14 high
        eor     x24, x24, x14                     //AES block 3 - round 14 high
        fmov    d4, x6                               //AES block 0 - mov low

        cmp     x0, x5                   //check if we have <= 8 blocks
        fmov    v4.d[1], x7                           //AES block 0 - mov high
        eor     x23, x23, x13                     //AES block 3 - round 14 low

        eor     x21, x21, x13                     //AES block 2 - round 14 low
        fmov    v5.d[1], x20                           //AES block 1 - mov high

        fmov    d6, x21                               //AES block 2 - mov low
        add     w12, w12, #1                            //CTR block 4

        orr     x9, x11, x9, lsl #32            //CTR block 4
        fmov    d7, x23                               //AES block 3 - mov low
        eor     x22, x22, x14                     //AES block 2 - round 14 high

        fmov    v6.d[1], x22                           //AES block 2 - mov high

        eor     v4.16b, v4.16b, v0.16b                          //AES block 0 - result
        fmov    d0, x10                               //CTR block 4

        fmov    v0.d[1], x9                               //CTR block 4
        rev     w9, w12                                 //CTR block 5
        add     w12, w12, #1                            //CTR block 5

        eor     v5.16b, v5.16b, v1.16b                          //AES block 1 - result
        fmov    d1, x10                               //CTR block 5
        orr     x9, x11, x9, lsl #32            //CTR block 5

        fmov    v1.d[1], x9                               //CTR block 5
        rev     w9, w12                                 //CTR block 6
        st1     { v4.16b}, [x2], #16                     //AES block 0 - store result

        fmov    v7.d[1], x24                           //AES block 3 - mov high
        orr     x9, x11, x9, lsl #32            //CTR block 6
        eor     v6.16b, v6.16b, v2.16b                          //AES block 2 - result

        st1     { v5.16b}, [x2], #16                     //AES block 1 - store result

        add     w12, w12, #1                            //CTR block 6
        fmov    d2, x10                               //CTR block 6

        fmov    v2.d[1], x9                               //CTR block 6
        st1     { v6.16b}, [x2], #16                     //AES block 2 - store result
        rev     w9, w12                                 //CTR block 7

        orr     x9, x11, x9, lsl #32            //CTR block 7

        eor     v7.16b, v7.16b, v3.16b                          //AES block 3 - result
        st1     { v7.16b}, [x2], #16                     //AES block 3 - store result
        b.ge    .L256_enc_prepretail                               //do prepretail

        //--------------------------------------------------------------------
        // Main loop: each iteration encrypts plaintext blocks 4k+4..4k+7
        // (AES-256 CTR, 14 rounds, round keys v18-v31) while computing GHASH
        // over the PREVIOUS iteration's four ciphertext blocks held in v4-v7,
        // using powers of H (v15=h4|.., v14, v13, v12=..|h1) and Karatsuba
        // mid terms (v17, v16). The accumulated tag lives in v11 (low),
        // v9 (high), v10 (mid); reduction uses the GCM polynomial constant
        // 0xc2 << 56 built in v8. Instruction interleaving is deliberate:
        // do not reorder.
        //--------------------------------------------------------------------
.L256_enc_main_loop:    //main  loop start
        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        rev64   v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        fmov    d3, x10                               //CTR block 4k+3

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        fmov    v3.d[1], x9                               //CTR block 4k+3

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        ldp     x23, x24, [x0, #48]           //AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
        rev     x23, x23
        rev     x24, x24
#endif
        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        ldp     x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
        rev     x21, x21
        rev     x22, x22
#endif
        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        eor     x23, x23, x13                     //AES block 4k+7 - round 14 low

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        eor     x22, x22, x14                     //AES block 4k+6 - round 14 high
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        rev64   v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)

        pmull2  v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)

        pmull   v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low

        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
        mov     d8, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        eor     v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        ins     v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid

        pmull2  v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high

        pmull   v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7

        pmull   v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        ldp     x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
        rev     x19, x19
        rev     x20, x20
#endif
        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        mov     d4, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        eor     v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low

        pmull2  v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
        eor     v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        eor     x19, x19, x13                     //AES block 4k+5 - round 14 low

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9
        eor     v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        eor     x21, x21, x13                     //AES block 4k+6 - round 14 low

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9
        movi    v8.8b, #0xc2

        pmull   v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
        fmov    d5, x19                               //AES block 4k+5 - mov low

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        ldp     x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10
        shl     d8, d8, #56               //mod_constant

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
        eor     v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9
        add     w12, w12, #1                            //CTR block 4k+3

        // From here: GHASH modular reduction (Karatsuba tidy-up then two
        // folds with the 0xc2<<56 constant) runs alongside AES rounds 11-13
        // and the final XOR/store of blocks 4k+4..4k+7.
        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 11
        eor     v4.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 11
        add     x0, x0, #64                       //AES input_ptr update

        pmull   v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        rev     w9, w12                                 //CTR block 4k+8
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10
        eor     x6, x6, x13                     //AES block 4k+4 - round 14 low

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 12
        eor     v10.16b, v10.16b, v4.16b                         //MODULO - karatsuba tidy up

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10
        eor     x7, x7, x14                     //AES block 4k+4 - round 14 high

        fmov    d4, x6                               //AES block 4k+4 - mov low
        orr     x9, x11, x9, lsl #32            //CTR block 4k+8
        eor     v7.16b, v9.16b, v7.16b                   //MODULO - fold into mid

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 12
        eor     x20, x20, x14                     //AES block 4k+5 - round 14 high

        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 11
        eor     x24, x24, x14                     //AES block 4k+7 - round 14 high

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 11
        add     w12, w12, #1                            //CTR block 4k+8

        aese    v0.16b, v31.16b                                     //AES block 4k+4 - round 13
        fmov    v4.d[1], x7                           //AES block 4k+4 - mov high
        eor     v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 12
        fmov    d7, x23                               //AES block 4k+7 - mov low

        aese    v1.16b, v31.16b                                     //AES block 4k+5 - round 13
        fmov    v5.d[1], x20                           //AES block 4k+5 - mov high

        fmov    d6, x21                               //AES block 4k+6 - mov low
        cmp     x0, x5                   //.LOOP CONTROL

        fmov    v6.d[1], x22                           //AES block 4k+6 - mov high

        pmull   v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
        eor     v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
        fmov    d0, x10                               //CTR block 4k+8

        fmov    v0.d[1], x9                               //CTR block 4k+8
        rev     w9, w12                                 //CTR block 4k+9
        add     w12, w12, #1                            //CTR block 4k+9

        eor     v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
        fmov    d1, x10                               //CTR block 4k+9
        orr     x9, x11, x9, lsl #32            //CTR block 4k+9

        fmov    v1.d[1], x9                               //CTR block 4k+9
        rev     w9, w12                                 //CTR block 4k+10
        st1     { v4.16b}, [x2], #16                     //AES block 4k+4 - store result

        fmov    v7.d[1], x24                           //AES block 4k+7 - mov high
        orr     x9, x11, x9, lsl #32            //CTR block 4k+10
        eor     v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result

        st1     { v5.16b}, [x2], #16                     //AES block 4k+5 - store result

        add     w12, w12, #1                            //CTR block 4k+10
        fmov    d2, x10                               //CTR block 4k+10

        fmov    v2.d[1], x9                               //CTR block 4k+10
        st1     { v6.16b}, [x2], #16                     //AES block 4k+6 - store result
        rev     w9, w12                                 //CTR block 4k+11

        orr     x9, x11, x9, lsl #32            //CTR block 4k+11

        eor     v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
        eor     v7.16b, v7.16b, v3.16b                          //AES block 4k+7 - result
        st1     { v7.16b}, [x2], #16                     //AES block 4k+7 - store result
        b.lt    .L256_enc_main_loop

        //--------------------------------------------------------------------
        // Prepretail: GHASH the last full group of four ciphertext blocks
        // (still in v4-v7) while running all 14 AES-256 rounds on the next
        // four counter blocks to pre-compute keystream for the tail. Unlike
        // the main loop, no plaintext is loaded and nothing is stored; the
        // fully-reduced tag accumulator ends up in v11 and the ready
        // keystream blocks in v0-v3 before falling through to the tail.
        //--------------------------------------------------------------------
.L256_enc_prepretail:   //PREPRETAIL
        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        fmov    d3, x10                               //CTR block 4k+3

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        rev64   v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)

        fmov    v3.d[1], x9                               //CTR block 4k+3
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1

        eor     v4.16b, v4.16b, v11.16b                           //PRE 1
        rev64   v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid

        pmull2  v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        pmull   v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2

        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3

        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
        mov     d8, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
        eor     v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
        add     w12, w12, #1                            //CTR block 4k+3

        pmull   v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid

        pmull2  v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high

        eor     v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
        ins     v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5

        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
        mov     d4, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        pmull2  v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid

        eor     v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        pmull   v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
        eor     v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        movi    v8.8b, #0xc2

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        shl     d8, d8, #56               //mod_constant

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid

        pmull   v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        eor     v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9

        // Modular reduction of the (high v9, mid v10, low v11) partial
        // products into a single 128-bit tag value in v11, interleaved
        // with AES rounds 9-13 of the tail keystream blocks.
        eor     v10.16b, v10.16b, v9.16b                         //karatsuba tidy up

        pmull   v4.1q, v9.1d, v8.1d
        ext     v9.16b, v9.16b, v9.16b, #8

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        eor     v10.16b, v10.16b, v11.16b

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 11
        eor     v10.16b, v10.16b, v4.16b

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 12

        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 11
        eor     v10.16b, v10.16b, v9.16b

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 11

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 12

        pmull   v4.1q, v10.1d, v8.1d

        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 11
        ext     v10.16b, v10.16b, v10.16b, #8

        aese    v3.16b, v30.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 12

        aese    v1.16b, v31.16b                                     //AES block 4k+5 - round 13
        eor     v11.16b, v11.16b, v4.16b

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 12

        aese    v3.16b, v31.16b                                     //AES block 4k+7 - round 13

        aese    v0.16b, v31.16b                                     //AES block 4k+4 - round 13

        aese    v2.16b, v31.16b                                     //AES block 4k+6 - round 13
        eor     v11.16b, v11.16b, v10.16b
.L256_enc_tail: //TAIL

        ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
        sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
        ldp     x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     x6, x6, x13                     //AES block 4k+4 - round 14 low
        eor     x7, x7, x14                     //AES block 4k+4 - round 14 high

        cmp     x5, #48
        fmov    d4, x6                               //AES block 4k+4 - mov low

        fmov    v4.d[1], x7                           //AES block 4k+4 - mov high

        eor     v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
        b.gt    .L256_enc_blocks_more_than_3

        cmp     x5, #32
        mov     v3.16b, v2.16b
        movi    v11.8b, #0

        movi    v9.8b, #0
        sub     w12, w12, #1

        mov     v2.16b, v1.16b
        movi    v10.8b, #0
        b.gt    .L256_enc_blocks_more_than_2

        mov     v3.16b, v1.16b
        sub     w12, w12, #1
        cmp     x5, #16

        b.gt    .L256_enc_blocks_more_than_1

        sub     w12, w12, #1
        b       .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3:   //blocks        left >  3
        st1     { v5.16b}, [x2], #16                    //AES final-3 block  - store result

        ldp     x6, x7, [x0], #16          //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        rev64   v4.16b, v5.16b                                   //GHASH final-3 block

        eor     x6, x6, x13                    //AES final-2 block - round 14 low
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        eor     x7, x7, x14                    //AES final-2 block - round 14 high

        mov     d22, v4.d[1]                                //GHASH final-3 block - mid
        fmov    d5, x6                                //AES final-2 block - mov low

        fmov    v5.d[1], x7                            //AES final-2 block - mov high

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
        movi    v8.8b, #0                                       //suppress further partial tag feed in

        mov     d10, v17.d[1]                              //GHASH final-3 block - mid

        pmull   v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low

        pmull2  v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high

        pmull   v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
        eor     v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
.L256_enc_blocks_more_than_2:   //blocks        left >  2

        //peel one block: store previous result, encrypt next block with
        //keystream v2, accumulate its GHASH contribution (v14 multiplier)
        st1     { v5.16b}, [x2], #16                    //AES final-2 block - store result

        ldp     x6, x7, [x0], #16          //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        rev64   v4.16b, v5.16b                                   //GHASH final-2 block

        eor     x6, x6, x13                    //AES final-1 block - round 14 low
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        fmov    d5, x6                                //AES final-1 block - mov low
        eor     x7, x7, x14                    //AES final-1 block - round 14 high

        fmov    v5.d[1], x7                            //AES final-1 block - mov high

        movi    v8.8b, #0                                       //suppress further partial tag feed in

        pmull2  v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
        mov     d22, v4.d[1]                                //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid

        eor     v5.16b, v5.16b, v2.16b                           //AES final-1 block - result

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high

        pmull   v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low

        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
.L256_enc_blocks_more_than_1:   //blocks        left >  1

        //peel one block: store previous result, encrypt next block with
        //keystream v3, accumulate its GHASH contribution (v13 multiplier)
        st1     { v5.16b}, [x2], #16                    //AES final-1 block - store result

        rev64   v4.16b, v5.16b                                   //GHASH final-1 block

        ldp     x6, x7, [x0], #16          //AES final block - load input low & high
#ifdef __AARCH64EB__
        rev     x6, x6
        rev     x7, x7
#endif
        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        movi    v8.8b, #0                                       //suppress further partial tag feed in

        eor     x6, x6, x13                    //AES final block - round 14 low
        mov     d22, v4.d[1]                                //GHASH final-1 block - mid

        pmull2  v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
        eor     x7, x7, x14                    //AES final block - round 14 high

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high

        ins     v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
        fmov    d5, x6                                //AES final block - mov low

        fmov    v5.d[1], x7                            //AES final block - mov high

        pmull2  v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid

        pmull   v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low

        eor     v5.16b, v5.16b, v3.16b                           //AES final block - result
        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
.L256_enc_blocks_less_than_1:   //blocks        left <= 1

        //Final (possibly partial) block: derive a byte mask from the residual
        //bit length, zero the unused tail of the last block, merge the
        //existing destination bytes back in before storing, then fold the
        //masked block into GHASH and perform the final modular reduction.
        //NOTE: x13/x14 no longer hold the round key here - they are reused as
        //the low/high 64-bit halves of the mask.
        and     x1, x1, #127                   //bit_length %= 128

        mvn     x13, xzr                                     //rk14_l = 0xffffffffffffffff
        sub     x1, x1, #128                   //bit_length -= 128

        neg     x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
        ld1     { v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored

        mvn     x14, xzr                                     //rk14_h = 0xffffffffffffffff
        and     x1, x1, #127                   //bit_length %= 128

        lsr     x14, x14, x1                    //rk14_h is mask for top 64b of last block
        cmp     x1, #64

        //if fewer than 64 bits are dropped the low half stays all-ones and
        //the shifted mask applies to the high half; otherwise the shifted
        //mask applies to the low half and the high half is zeroed
        csel    x6, x13, x14, lt
        csel    x7, x14, xzr, lt

        fmov    d0, x6                                //ctr0b is mask for last block

        fmov    v0.d[1], x7

        and     v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits

        rev64   v4.16b, v5.16b                                   //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        bif     v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing

        pmull2  v20.1q, v4.2d, v12.2d                         //GHASH final block - high
        mov     d8, v4.d[1]                                 //GHASH final block - mid
#ifndef __AARCH64EB__
        rev     w9, w12
#else
        mov     w9, w12
#endif

        pmull   v21.1q, v4.1d, v12.1d                         //GHASH final block - low

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final block - high
        eor     v8.8b, v8.8b, v4.8b                         //GHASH final block - mid

        pmull   v8.1q, v8.1d, v16.1d                         //GHASH final block - mid

        eor     v11.16b, v11.16b, v21.16b                           //GHASH final block - low

        eor     v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
        movi    v8.8b, #0xc2                            //GCM reduction constant (0xc2 << 56 after shl)

        eor     v4.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up

        shl     d8, d8, #56              //mod_constant

        eor     v10.16b, v10.16b, v4.16b                        //MODULO - karatsuba tidy up

        pmull   v7.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid

        ext     v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment

        eor     v10.16b, v10.16b, v7.16b                     //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid

        pmull   v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low

        ext     v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment

        str     w9, [x16, #12]                         //store the updated counter

        st1     { v5.16b}, [x2]                         //store all 16B
        eor     v11.16b, v11.16b, v9.16b                        //MODULO - fold into low

        eor     v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b
        mov     x0, x15                                 //return byte_len (stashed in x15 by the prologue)
        st1     { v11.16b }, [x3]                       //write back the updated GHASH state

        //restore callee-saved registers (AAPCS64: x19-x24, d8-d15) and
        //release the 112-byte frame allocated in the prologue
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

.L256_enc_ret:
        mov     w0, #0x0                        //zero-length input: nothing processed, return 0
        ret
.size   aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
.globl  aes_gcm_dec_256_kernel
.type   aes_gcm_dec_256_kernel,%function
.align  4
//AES-256-GCM decrypt kernel.
//  x0 = ciphertext input pointer
//  x1 = input length in bits (byte_len = x1 >> 3)
//  x2 = plaintext output pointer
//  x3 = GHASH state: current hash value, followed by precomputed H powers
//       (h1/h2/h3/h4 loaded from offsets #32/#64/#80/#112 below)
//  x4 = counter block (ctr96_b64 | ctr96_t32); updated counter written back
//  x5 = expanded AES-256 key schedule (round keys rk0..rk14)
//Callee-saved x19-x24 and d8-d15 are preserved in a 112-byte frame.
aes_gcm_dec_256_kernel:
        cbz     x1, .L256_dec_ret               //nothing to do for zero-length input
        stp     x19, x20, [sp, #-112]!
        mov     x16, x4                         //x16 = counter block pointer
        mov     x8, x5                          //x8  = key schedule pointer
        stp     x21, x22, [sp, #16]
        stp     x23, x24, [sp, #32]
        stp     d8, d9, [sp, #48]
        stp     d10, d11, [sp, #64]
        stp     d12, d13, [sp, #80]
        stp     d14, d15, [sp, #96]

        lsr     x5, x1, #3              //byte_len
        mov     x15, x5                         //save byte_len for the return value
        ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev     x10, x10
        rev     x11, x11
#endif
        ldp     x13, x14, [x8, #224]                     //load rk14
#ifdef __AARCH64EB__
        ror     x14, x14, #32
        ror     x13, x13, #32
#endif
        ld1     {v18.4s}, [x8], #16                               //load rk0
        sub     x5, x5, #1      //byte_len - 1

        ld1     {v19.4s}, [x8], #16                               //load rk1
        and     x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

        add     x4, x0, x1, lsr #3   //end_input_ptr
        ld1     {v20.4s}, [x8], #16                               //load rk2

        lsr     x12, x11, #32
        ld1     {v21.4s}, [x8], #16                               //load rk3
        orr     w11, w11, w11                   //keep only the low 32 bits of ctr96_t32

        ld1     {v22.4s}, [x8], #16                               //load rk4
        add     x5, x5, x0                      //x5 = main-loop end pointer
        rev     w12, w12                                //rev_ctr32

        //pre-build CTR blocks 1..3 (block 0 is loaded directly from [x16])
        add     w12, w12, #1                            //increment rev_ctr32
        fmov    d3, x10                               //CTR block 3

        rev     w9, w12                                 //CTR block 1
        add     w12, w12, #1                            //CTR block 1
        fmov    d1, x10                               //CTR block 1

        orr     x9, x11, x9, lsl #32            //CTR block 1
        ld1     { v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible

        fmov    v1.d[1], x9                               //CTR block 1
        rev     w9, w12                                 //CTR block 2
        add     w12, w12, #1                            //CTR block 2

        fmov    d2, x10                               //CTR block 2
        orr     x9, x11, x9, lsl #32            //CTR block 2

        fmov    v2.d[1], x9                               //CTR block 2
        rev     w9, w12                                 //CTR block 3

        orr     x9, x11, x9, lsl #32            //CTR block 3
        ld1     {v23.4s}, [x8], #16                               //load rk5

        fmov    v3.d[1], x9                               //CTR block 3
        add     w12, w12, #1                            //CTR block 3

        ld1     {v24.4s}, [x8], #16                               //load rk6

        ld1     {v25.4s}, [x8], #16                               //load rk7

        ld1     {v26.4s}, [x8], #16                               //load rk8

        //Run AES rounds 0-13 on CTR blocks 0-3 (v0-v3), interleaved with the
        //remaining round-key loads, the loads of the GHASH H powers
        //(v12=h1, v13=h2, v14=h3, v15=h4) and the Karatsuba key preparation
        //(v17 = h4k|h3k, v16 = h2k|h1k). Round 14 is applied later on the
        //integer side via x13/x14 (rk14).
        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 0
        ldr     q14, [x3, #80]                         //load h3l | h3h
#ifndef __AARCH64EB__
        ext     v14.16b, v14.16b, v14.16b, #8
#endif

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 0
        ldr     q15, [x3, #112]                        //load h4l | h4h
#ifndef __AARCH64EB__
        ext     v15.16b, v15.16b, v15.16b, #8
#endif

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 0
        ldr     q13, [x3, #64]                         //load h2l | h2h
#ifndef __AARCH64EB__
        ext     v13.16b, v13.16b, v13.16b, #8
#endif

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 0
        ld1     {v27.4s}, [x8], #16                                 //load rk9

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 1

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 1
        ld1     { v11.16b}, [x3]                //load current GHASH value and put it in bit-reversed GHASH order
        ext     v11.16b, v11.16b, v11.16b, #8
        rev64   v11.16b, v11.16b

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 1
        ld1     {v28.4s}, [x8], #16                              //load rk10

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 1
        ld1     {v29.4s}, [x8], #16                              //load rk11

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 2
        ldr     q12, [x3, #32]                         //load h1l | h1h
#ifndef __AARCH64EB__
        ext     v12.16b, v12.16b, v12.16b, #8
#endif
        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 2
        ld1     {v30.4s}, [x8], #16                              //load rk12

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 2

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 3

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 2

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 3

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 4
        cmp     x0, x5                   //check if we have <= 4 blocks

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 3

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 3

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 4

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 4

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 4

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 5

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 5

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 5

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 5

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 6

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 6

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 6

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 6

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 7

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 7

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 7

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 8

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 7

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 8

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 8

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 9

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 8
        ld1     {v31.4s}, [x8], #16                             //load rk13

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 9

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 10

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 9

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 10

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 9

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 10

        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 11

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 10

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 11

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 11

        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 11

        //gather the high/low halves of the H powers and derive the
        //Karatsuba "mid" keys (h XOR of high/low halves)
        trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h

        trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l

        trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
        trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 1 - round 12

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 0 - round 12

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 2 - round 12

        aese    v3.16b, v30.16b
        aesmc   v3.16b, v3.16b          //AES block 3 - round 12
        eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k

        aese    v1.16b, v31.16b                                     //AES block 1 - round 13

        aese    v2.16b, v31.16b                                     //AES block 2 - round 13
        eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k

        aese    v3.16b, v31.16b                                     //AES block 3 - round 13

        aese    v0.16b, v31.16b                                     //AES block 0 - round 13
        b.ge    .L256_dec_tail                                    //handle tail

        //More than 4 blocks: decrypt the first four. Decryption XORs the
        //finished keystream with the loaded ciphertext and GHASHes the
        //ciphertext itself (rev64 of v4/v5 below). Results for blocks 0-1
        //are stored here; blocks 2-3 are stored inside the main loop.
        //CTR blocks 4-6 are set up for the next iteration.
        ld1     {v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext

        rev     w9, w12                                 //CTR block 4

        eor     v0.16b, v4.16b, v0.16b                            //AES block 0 - result

        eor     v1.16b, v5.16b, v1.16b                            //AES block 1 - result
        rev64   v5.16b, v5.16b                                    //GHASH block 1
        ld1     {v6.16b}, [x0], #16                       //AES block 2 - load ciphertext

        mov     x7, v0.d[1]                            //AES block 0 - mov high

        mov     x6, v0.d[0]                            //AES block 0 - mov low
        rev64   v4.16b, v4.16b                                    //GHASH block 0
        add     w12, w12, #1                            //CTR block 4

        fmov    d0, x10                               //CTR block 4
        orr     x9, x11, x9, lsl #32            //CTR block 4

        fmov    v0.d[1], x9                               //CTR block 4
        rev     w9, w12                                 //CTR block 5
        add     w12, w12, #1                            //CTR block 5

        mov     x19, v1.d[0]                            //AES block 1 - mov low

        orr     x9, x11, x9, lsl #32            //CTR block 5
        mov     x20, v1.d[1]                            //AES block 1 - mov high
        eor     x7, x7, x14                   //AES block 0 - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        eor     x6, x6, x13                   //AES block 0 - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        stp     x6, x7, [x2], #16        //AES block 0 - store result
        fmov    d1, x10                               //CTR block 5

        ld1     {v7.16b}, [x0], #16                       //AES block 3 - load ciphertext

        fmov    v1.d[1], x9                               //CTR block 5
        rev     w9, w12                                 //CTR block 6
        add     w12, w12, #1                            //CTR block 6

        eor     x19, x19, x13                   //AES block 1 - round 14 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        orr     x9, x11, x9, lsl #32            //CTR block 6

        eor     x20, x20, x14                   //AES block 1 - round 14 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        stp     x19, x20, [x2], #16        //AES block 1 - store result

        eor     v2.16b, v6.16b, v2.16b                            //AES block 2 - result
        cmp     x0, x5                   //check if we have <= 8 blocks
        b.ge    .L256_dec_prepretail                              //do prepretail

.L256_dec_main_loop:    //main  loop start
        //Each iteration handles four blocks: it finishes the keystream for
        //blocks 4k+4..4k+7 (AES rounds 0-13 on v0-v3) while GHASHing the
        //previous four ciphertext blocks. GHASH uses per-block Karatsuba
        //multiplies against H^4..H^1 (v15/v14/v13/v12), accumulating
        //low/mid/high in v11/v10/v9, then reduces modulo the GCM polynomial
        //via the 0xc2<<56 constant. The final round-14 key (x13/x14) is
        //applied on the integer side just before each store.
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        fmov    d2, x10                               //CTR block 4k+6

        fmov    v2.d[1], x9                               //CTR block 4k+6
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1
        rev     w9, w12                                 //CTR block 4k+7

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid
        fmov    d3, x10                               //CTR block 4k+7

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
        orr     x9, x11, x9, lsl #32            //CTR block 4k+7

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        fmov    v3.d[1], x9                               //CTR block 4k+7

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        eor     x22, x22, x14                   //AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
        eor     x21, x21, x13                   //AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif
        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low

        pmull2  v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        eor     x23, x23, x13                   //AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        pmull   v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
        eor     x24, x24, x14                   //AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
        add     w12, w12, #1                            //CTR block 4k+7

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        mov     d8, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid

        pmull   v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        eor     v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
        rev     w9, w12                                 //CTR block 4k+8

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
        ins     v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        add     w12, w12, #1                            //CTR block 4k+8

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7

        pmull2  v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
        mov     d6, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5

        pmull2  v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6

        pmull   v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
        orr     x9, x11, x9, lsl #32            //CTR block 4k+8
        eor     v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9
        eor     v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10

        pmull   v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
        movi    v8.8b, #0xc2                            //GCM reduction constant

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        eor     v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low

        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 11

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        shl     d8, d8, #56               //mod_constant

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
        eor     v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 12

        pmull   v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
        eor     v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9
        ld1     {v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext

        aese    v0.16b, v31.16b                                     //AES block 4k+4 - round 13
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10
        eor     v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9
        ld1     {v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
        eor     v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 11
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10
        eor     v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9
        ld1     {v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 12
        ld1     {v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext

        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 11
        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        aese    v1.16b, v31.16b                                     //AES block 4k+5 - round 13
        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 12
        fmov    d0, x10                               //CTR block 4k+8

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 11
        fmov    v0.d[1], x9                               //CTR block 4k+8

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        eor     v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
        rev     w9, w12                                 //CTR block 4k+9

        aese    v2.16b, v31.16b                                     //AES block 4k+6 - round 13
        orr     x9, x11, x9, lsl #32            //CTR block 4k+9
        cmp     x0, x5                   //.LOOP CONTROL

        add     w12, w12, #1                            //CTR block 4k+9

        eor     x6, x6, x13                   //AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     x7, x7, x14                   //AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        mov     x20, v1.d[1]                            //AES block 4k+5 - mov high
        eor     v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        aese    v3.16b, v30.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 12
        mov     x19, v1.d[0]                            //AES block 4k+5 - mov low

        fmov    d1, x10                               //CTR block 4k+9
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        fmov    v1.d[1], x9                               //CTR block 4k+9
        rev     w9, w12                                 //CTR block 4k+10
        add     w12, w12, #1                            //CTR block 4k+10

        aese    v3.16b, v31.16b                                     //AES block 4k+7 - round 13
        orr     x9, x11, x9, lsl #32            //CTR block 4k+10

        rev64   v5.16b, v5.16b                                    //GHASH block 4k+5
        eor     x20, x20, x14                   //AES block 4k+5 - round 14 high
#ifdef __AARCH64EB__
        rev     x20, x20
#endif
        stp     x6, x7, [x2], #16        //AES block 4k+4 - store result

        eor     x19, x19, x13                   //AES block 4k+5 - round 14 low
#ifdef __AARCH64EB__
        rev     x19, x19
#endif
        stp     x19, x20, [x2], #16        //AES block 4k+5 - store result

        rev64   v4.16b, v4.16b                                    //GHASH block 4k+4
        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        b.lt    .L256_dec_main_loop


//------------------------------------------------------------------------
// PREPRETAIL: reached when no further full 4-block iteration will run.
// GHASHes the four ciphertext blocks held in v4-v7, using the hash-key
// powers in v15..v12 (presumably H^4..H^1, per the h4l|h4h-style loads in
// the prologue - TODO confirm) and the karatsuba mid constants in v16/v17,
// while interleaving all AES-256 rounds for counter blocks 4k+4..4k+7
// whose keystream the tail code below consumes.  Also stores the plaintext
// for blocks 4k+2/4k+3 and completes the GHASH modulo reduction into v11.
//------------------------------------------------------------------------
.L256_dec_prepretail:   //PREPRETAIL
        ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
        mov     x21, v2.d[0]                            //AES block 4k+2 - mov low
        eor     v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result

        aese    v0.16b, v18.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
        mov     x22, v2.d[1]                            //AES block 4k+2 - mov high

        aese    v1.16b, v18.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
        fmov    d2, x10                               //CTR block 4k+6

        fmov    v2.d[1], x9                               //CTR block 4k+6
        rev     w9, w12                                 //CTR block 4k+7
        eor     v4.16b, v4.16b, v11.16b                           //PRE 1

        rev64   v6.16b, v6.16b                                    //GHASH block 4k+2
        orr     x9, x11, x9, lsl #32            //CTR block 4k+7
        mov     x23, v3.d[0]                            //AES block 4k+3 - mov low

        aese    v1.16b, v19.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
        mov     x24, v3.d[1]                            //AES block 4k+3 - mov high

        pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
        mov     d8, v4.d[1]                                  //GHASH block 4k - mid
        fmov    d3, x10                               //CTR block 4k+7

        pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
        fmov    v3.d[1], x9                               //CTR block 4k+7

        aese    v2.16b, v18.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
        mov     d10, v17.d[1]                               //GHASH block 4k - mid

        aese    v0.16b, v19.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
        eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid

        pmull2  v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high

        aese    v2.16b, v19.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
        rev64   v7.16b, v7.16b                                    //GHASH block 4k+3

        aese    v3.16b, v18.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0

        pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high

        pmull   v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low

        aese    v3.16b, v19.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
        mov     d4, v5.d[1]                                  //GHASH block 4k+1 - mid

        aese    v0.16b, v20.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2

        aese    v1.16b, v20.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
        eor     v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low

        aese    v2.16b, v20.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2

        aese    v0.16b, v21.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
        mov     d8, v6.d[1]                                  //GHASH block 4k+2 - mid

        aese    v3.16b, v20.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
        eor     v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid

        pmull   v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low

        aese    v0.16b, v22.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4

        aese    v3.16b, v21.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
        eor     v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid

        pmull   v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid

        aese    v0.16b, v23.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
        eor     v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low

        aese    v3.16b, v22.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4

        pmull2  v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
        eor     v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid

        pmull2  v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high

        aese    v3.16b, v23.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
        ins     v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid

        aese    v2.16b, v21.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3

        aese    v1.16b, v21.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
        eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high

        pmull   v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low

        aese    v2.16b, v22.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
        mov     d6, v7.d[1]                                  //GHASH block 4k+3 - mid

        aese    v1.16b, v22.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4

        pmull2  v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid

        aese    v2.16b, v23.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
        eor     v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid

        aese    v1.16b, v23.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5

        aese    v3.16b, v24.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
        eor     v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid

        aese    v2.16b, v24.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6

        aese    v0.16b, v24.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
        movi    v8.8b, #0xc2            //GHASH reduction poly constant (x^63+x^62+x^57 after shl #56)

        aese    v1.16b, v24.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
        eor     v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low

        pmull   v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid

        aese    v3.16b, v25.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
        eor     v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high

        aese    v1.16b, v25.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7

        aese    v0.16b, v25.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
        eor     v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid

        aese    v3.16b, v26.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8

        aese    v2.16b, v25.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
        eor     v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        aese    v1.16b, v26.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8

        aese    v0.16b, v26.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
        shl     d8, d8, #56               //mod_constant

        aese    v2.16b, v26.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8

        aese    v1.16b, v27.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 9
        eor     v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up

        pmull   v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid

        aese    v2.16b, v27.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 9
        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        aese    v3.16b, v27.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 9

        aese    v0.16b, v27.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 9
        eor     v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid

        aese    v2.16b, v28.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 10

        aese    v3.16b, v28.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 10

        aese    v0.16b, v28.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 10
        eor     x22, x22, x14                   //AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
        rev     x22, x22
#endif
        aese    v1.16b, v28.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 10
        eor     x23, x23, x13                   //AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
        rev     x23, x23
#endif
        aese    v2.16b, v29.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 11
        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        aese    v0.16b, v29.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 11
        add     w12, w12, #1                            //CTR block 4k+7

        aese    v1.16b, v29.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 11
        eor     x21, x21, x13                   //AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
        rev     x21, x21
#endif

        aese    v2.16b, v30.16b
        aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 12

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
        eor     x24, x24, x14                   //AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
        rev     x24, x24
#endif

        aese    v3.16b, v29.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 11
        stp     x21, x22, [x2], #16        //AES block 4k+2 - store result

        aese    v1.16b, v30.16b
        aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 12
        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        aese    v0.16b, v30.16b
        aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 12
        stp     x23, x24, [x2], #16        //AES block 4k+3 - store result

        aese    v3.16b, v30.16b
        aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 12
        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        aese    v1.16b, v31.16b                                     //AES block 4k+5 - round 13

        aese    v0.16b, v31.16b                                     //AES block 4k+4 - round 13

        aese    v3.16b, v31.16b                                     //AES block 4k+7 - round 13

        aese    v2.16b, v31.16b                                     //AES block 4k+6 - round 13
        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
//------------------------------------------------------------------------
// TAIL: 1 to 4 blocks remain (the last possibly partial).  v0..v3 hold
// pre-computed keystream for the remaining counter blocks.  Decrypt the
// first remaining block, then dispatch on bytes left (>48 / >32 / >16),
// shuffling the keystream registers so that v3 always ends up holding the
// keystream for the final (possibly partial) block.  w12 is decremented
// for each keystream block that goes unused so the stored counter stays
// consistent with the number of blocks actually consumed.
//------------------------------------------------------------------------
.L256_dec_tail: //TAIL

        sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
        ld1     { v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext

        eor     v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result

        mov     x6, v0.d[0]                            //AES block 4k+4 - mov low

        mov     x7, v0.d[1]                            //AES block 4k+4 - mov high
        ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag

        cmp     x5, #48

        eor     x6, x6, x13                   //AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif

        eor     x7, x7, x14                   //AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
        b.gt    .L256_dec_blocks_more_than_3

        sub     w12, w12, #1                    //one fewer keystream block consumed
        mov     v3.16b, v2.16b                  //shift keystream down: final block uses v2's keystream
        movi    v10.8b, #0                      //zero GHASH mid accumulator

        movi    v11.8b, #0                      //zero GHASH low accumulator
        cmp     x5, #32

        movi    v9.8b, #0                       //zero GHASH high accumulator
        mov     v2.16b, v1.16b
        b.gt    .L256_dec_blocks_more_than_2

        sub     w12, w12, #1

        mov     v3.16b, v1.16b                  //final block uses v1's keystream
        cmp     x5, #16
        b.gt    .L256_dec_blocks_more_than_1

        sub     w12, w12, #1
        b       .L256_dec_blocks_less_than_1
//------------------------------------------------------------------------
// >3 blocks left: store plaintext for block final-3, fold it into GHASH
// with v15 (highest remaining hash-key power), and decrypt block final-2.
// The partial-tag feed-in (v8) is consumed here and then zeroed.
//------------------------------------------------------------------------
.L256_dec_blocks_more_than_3:   //blocks        left >  3
        rev64   v4.16b, v5.16b                                   //GHASH final-3 block
        ld1     { v5.16b}, [x0], #16                     //AES final-2 block - load ciphertext

        stp     x6, x7, [x2], #16       //AES final-3 block  - store result

        mov     d10, v17.d[1]                              //GHASH final-3 block - mid

        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag

        eor     v0.16b, v5.16b, v1.16b                           //AES final-2 block - result

        mov     d22, v4.d[1]                                //GHASH final-3 block - mid

        mov     x6, v0.d[0]                           //AES final-2 block - mov low

        mov     x7, v0.d[1]                           //AES final-2 block - mov high

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid

        movi    v8.8b, #0                                       //suppress further partial tag feed in

        pmull2  v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high

        pmull   v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
        eor     x6, x6, x13                  //AES final-2 block - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif

        pmull   v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
        eor     x7, x7, x14                  //AES final-2 block - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
//------------------------------------------------------------------------
// >2 blocks left: store plaintext for block final-2, fold it into GHASH
// with v14, and decrypt block final-1.  Accumulates into v9/v10/v11
// (high/mid/low) via the v20/v21/v22 temporaries.
//------------------------------------------------------------------------
.L256_dec_blocks_more_than_2:   //blocks        left >  2

        rev64   v4.16b, v5.16b                                   //GHASH final-2 block
        ld1     { v5.16b}, [x0], #16                     //AES final-1 block - load ciphertext

        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag
        stp     x6, x7, [x2], #16       //AES final-2 block  - store result

        eor     v0.16b, v5.16b, v2.16b                           //AES final-1 block - result

        mov     d22, v4.d[1]                                //GHASH final-2 block - mid

        pmull   v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low

        pmull2  v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
        mov     x6, v0.d[0]                           //AES final-1 block - mov low

        mov     x7, v0.d[1]                           //AES final-1 block - mov high
        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
        movi    v8.8b, #0                                       //suppress further partial tag feed in

        pmull   v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
        eor     x6, x6, x13                  //AES final-1 block - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif

        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
        eor     x7, x7, x14                  //AES final-1 block - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
//------------------------------------------------------------------------
// >1 block left: store plaintext for block final-1, fold it into GHASH
// with v13, and decrypt the final (possibly partial) block using the
// keystream in v3.
//------------------------------------------------------------------------
.L256_dec_blocks_more_than_1:   //blocks        left >  1

        stp     x6, x7, [x2], #16       //AES final-1 block  - store result
        rev64   v4.16b, v5.16b                                   //GHASH final-1 block

        ld1     { v5.16b}, [x0], #16                     //AES final block - load ciphertext

        eor     v4.16b, v4.16b, v8.16b                          //feed in partial tag
        movi    v8.8b, #0                                       //suppress further partial tag feed in

        mov     d22, v4.d[1]                                //GHASH final-1 block - mid

        eor     v0.16b, v5.16b, v3.16b                           //AES final block - result

        pmull2  v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high

        eor     v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid

        pmull   v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
        mov     x6, v0.d[0]                           //AES final block - mov low

        ins     v22.d[1], v22.d[0]                           //GHASH final-1 block - mid

        mov     x7, v0.d[1]                           //AES final block - mov high

        pmull2  v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
        eor     x6, x6, x13                  //AES final block - round 14 low
#ifdef __AARCH64EB__
        rev     x6, x6
#endif
        eor     v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low

        eor     v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high

        eor     v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
        eor     x7, x7, x14                  //AES final block - round 14 high
#ifdef __AARCH64EB__
        rev     x7, x7
#endif
//------------------------------------------------------------------------
// Final (possibly partial) block.  Builds a byte mask for the valid bits
// of the last block from the total bit length in x1, merges the decrypted
// bytes with the caller's existing output bytes (so a partial block never
// clobbers memory past the plaintext), GHASHes the masked final block with
// v12 (lowest hash-key power), performs the final modulo reduction,
// stores the updated counter and tag state, restores callee-saved
// registers and returns.  x13/x14 (last round key halves - dead after
// their final use above) are reused here as mask scratch.
//------------------------------------------------------------------------
.L256_dec_blocks_less_than_1:   //blocks        left <= 1

        and     x1, x1, #127                   //bit_length %= 128
        mvn     x14, xzr                                     //rk14_h = 0xffffffffffffffff

        sub     x1, x1, #128                   //bit_length -= 128
        mvn     x13, xzr                                     //rk14_l = 0xffffffffffffffff

        ldp     x4, x5, [x2] //load existing bytes we need to not overwrite
        neg     x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])

        and     x1, x1, #127                   //bit_length %= 128

        lsr     x14, x14, x1                    //rk14_h is mask for top 64b of last block
        cmp     x1, #64

        csel    x9, x13, x14, lt                //low-64 mask: all-ones if >64 valid bits, else partial mask
        csel    x10, x14, xzr, lt               //high-64 mask: partial mask if >64 valid bits, else zero

        fmov    d0, x9                                  //ctr0b is mask for last block
        and     x6, x6, x9                      //keep only valid decrypted low bytes

        mov     v0.d[1], x10
        bic     x4, x4, x9          //mask out low existing bytes

#ifndef __AARCH64EB__
        rev     w9, w12                         //counter word is kept big-endian in memory
#else
        mov     w9, w12
#endif

        bic     x5, x5, x10      //mask out high existing bytes

        orr     x6, x6, x4                      //merge plaintext with preserved bytes - low

        and     x7, x7, x10                     //keep only valid decrypted high bytes

        orr     x7, x7, x5                      //merge plaintext with preserved bytes - high

        and     v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits

        rev64   v4.16b, v5.16b                                    //GHASH final block

        eor     v4.16b, v4.16b, v8.16b                           //feed in partial tag

        pmull   v21.1q, v4.1d, v12.1d                          //GHASH final block - low

        mov     d8, v4.d[1]                                  //GHASH final block - mid

        eor     v8.8b, v8.8b, v4.8b                          //GHASH final block - mid

        pmull2  v20.1q, v4.2d, v12.2d                          //GHASH final block - high

        pmull   v8.1q, v8.1d, v16.1d                          //GHASH final block - mid

        eor     v9.16b, v9.16b, v20.16b                            //GHASH final block - high

        eor     v11.16b, v11.16b, v21.16b                            //GHASH final block - low

        eor     v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
        movi    v8.8b, #0xc2            //GHASH reduction poly constant (becomes mod_constant after shl)

        eor     v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up

        shl     d8, d8, #56               //mod_constant

        eor     v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up

        pmull   v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid

        ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment

        eor     v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid

        eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid

        pmull   v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low

        ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment

        eor     v11.16b, v11.16b, v8.16b               //MODULO - fold into low

        stp     x6, x7, [x2]            //store final (masked/merged) plaintext block

        str     w9, [x16, #12]                          //store the updated counter

        eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
        ext     v11.16b, v11.16b, v11.16b, #8           //undo the PRE byte order swizzle
        rev64   v11.16b, v11.16b
        mov     x0, x15                 //return value - presumably the saved byte length from the prologue (above this excerpt); TODO confirm
        st1     { v11.16b }, [x3]       //write back updated GHASH tag state

        //restore callee-saved registers (AAPCS64: x19-x28 and low 64b of v8-v15)
        ldp     x21, x22, [sp, #16]
        ldp     x23, x24, [sp, #32]
        ldp     d8, d9, [sp, #48]
        ldp     d10, d11, [sp, #64]
        ldp     d12, d13, [sp, #80]
        ldp     d14, d15, [sp, #96]
        ldp     x19, x20, [sp], #112
        ret

//early-exit path (taken for zero-length input at function entry): return 0
.L256_dec_ret:
        mov     w0, #0x0
        ret
.size   aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2
#endif
